Repository: NVIDIA-Merlin/HierarchicalKV
Branch: master
Commit: ae24eecde0b4
Files: 104
Total size: 2.0 MB

Directory structure:
gitextract_3c35qd95/

├── .bazeliskrc
├── .bazelrc
├── .clang-format
├── .github/
│   └── workflows/
│       ├── blossom-ci.yml
│       ├── docs-build.yaml
│       ├── docs-preview-pr.yaml
│       ├── docs-remove-stale-reviews.yaml
│       └── docs-sched-rebuild.yaml
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── STYLE_GUIDE.md
├── WORKSPACE
├── bazel_build.sh
├── benchmark/
│   ├── BUILD
│   ├── benchmark_util.cuh
│   ├── dual_bucket_benchmark.cc.cu
│   ├── find_with_missed_keys_benchmark.cc.cu
│   └── merlin_hashtable_benchmark.cc.cu
├── build_deps/
│   ├── gpus/
│   │   ├── BUILD
│   │   ├── check_cuda_libs.py
│   │   ├── configure.bzl
│   │   ├── crosstool/
│   │   │   ├── BUILD
│   │   │   ├── BUILD.tpl
│   │   │   ├── cc_toolchain_config.bzl.tpl
│   │   │   └── crosstool_compiler_wrapper.tpl
│   │   ├── cuda/
│   │   │   ├── BUILD
│   │   │   ├── BUILD.tpl
│   │   │   ├── build_defs.bzl.tpl
│   │   │   ├── cuda_config.h.tpl
│   │   │   └── cuda_config.py.tpl
│   │   └── find_cuda_config.py
│   └── remote_config/
│       ├── BUILD
│       ├── BUILD.tpl
│       ├── common.bzl
│       └── remote_platform_configure.bzl
├── cmake/
│   └── modules/
│       └── ClangFormat.cmake
├── docs/
│   ├── Makefile
│   ├── README.md
│   ├── make.bat
│   ├── requirements-doc.txt
│   └── source/
│       ├── _static/
│       │   ├── .gitkeep
│       │   └── css/
│       │       ├── banner.css
│       │       └── custom.css
│       ├── _templates/
│       │   ├── footer.html
│       │   └── versions.html
│       ├── conf.py
│       ├── index.rst
│       └── toc.yaml
├── include/
│   ├── BUILD
│   ├── merlin/
│   │   ├── BUILD
│   │   ├── allocator.cuh
│   │   ├── array_kernels.cuh
│   │   ├── core_kernels/
│   │   │   ├── BUILD
│   │   │   ├── accum_or_assign.cuh
│   │   │   ├── contains.cuh
│   │   │   ├── dual_bucket_lookup.cuh
│   │   │   ├── dual_bucket_upsert.cuh
│   │   │   ├── dual_bucket_utils.cuh
│   │   │   ├── find_or_insert.cuh
│   │   │   ├── find_ptr_or_insert.cuh
│   │   │   ├── group_lock_kernels.cuh
│   │   │   ├── kernel_utils.cuh
│   │   │   ├── lookup.cuh
│   │   │   ├── lookup_ptr.cuh
│   │   │   ├── update.cuh
│   │   │   ├── update_score.cuh
│   │   │   ├── update_values.cuh
│   │   │   ├── upsert.cuh
│   │   │   └── upsert_and_evict.cuh
│   │   ├── core_kernels.cuh
│   │   ├── debug.hpp
│   │   ├── flexible_buffer.cuh
│   │   ├── group_lock.cuh
│   │   ├── memory_pool.cuh
│   │   ├── multi_vector.hpp
│   │   ├── optimizers.cuh
│   │   ├── types.cuh
│   │   └── utils.cuh
│   ├── merlin_hashtable.cuh
│   └── merlin_localfile.hpp
├── run_all_tests.sh
└── tests/
    ├── accum_or_assign_test.cc.cu
    ├── assign_score_test.cc.cu
    ├── assign_values_test.cc.cu
    ├── dual_bucket_test.cc.cu
    ├── dynamic_max_capacity_test.cc.cu
    ├── export_batch_if_test.cc.cu
    ├── find_or_insert_ptr_lock_test.cc.cu
    ├── find_or_insert_ptr_test.cc.cu
    ├── find_or_insert_test.cc.cu
    ├── find_with_missed_keys_test.cc.cu
    ├── group_lock_test.cc.cu
    ├── insert_and_evict_test.cc.cu
    ├── lock_unlock_test.cc.cu
    ├── memory_pool_test.cc.cu
    ├── merlin_hashtable_test.cc.cu
    ├── reserved_keys_test.cc.cu
    ├── save_and_load_test.cc.cu
    ├── test_util.cuh
    └── uint32_score_test.cc.cu

================================================
FILE CONTENTS
================================================

================================================
FILE: .bazeliskrc
================================================
USE_BAZEL_VERSION=5.0.0


================================================
FILE: .bazelrc
================================================
build -c opt
build --copt -O3
build --copt -pthread
build --linkopt -pthread
build --linkopt -ldl
build --incompatible_linkopts_to_linklibs
build --copt -g --strip=never
build --experimental_repo_remote_exec

# By default, build HKV in C++ 17 mode.
build --cxxopt=-std=c++17
build --host_cxxopt=-std=c++17

# This config refers to building CUDA kernels with nvcc.
build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain

# CUDA options
build:cuda --action_env GCC_HOST_COMPILER_PATH="/usr/bin/gcc"
build:cuda --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda"
build:cuda --action_env CUDA_VERSION="11"
build:cuda --action_env CUDNN_VERSION="8"
build:cuda --action_env CUDNN_INSTALL_PATH="/usr/"
build:cuda --action_env CUDA_COMPUTE_CAPABILITIES="7.5"


================================================
FILE: .clang-format
================================================
BasedOnStyle: Google
DerivePointerAlignment: false
IncludeBlocks: Merge
SortIncludes: true


================================================
FILE: .github/workflows/blossom-ci.yml
================================================
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: Blossom-CI
on:
  issue_comment:
    types: [created]
  workflow_dispatch:
      inputs:
          platform:
            description: 'runs-on argument'     
            required: false
          args:
            description: 'argument'     
            required: false
jobs:
  Authorization:
    name: Authorization
    runs-on: blossom 
    outputs:
      args: ${{ env.args }}
      
    # This job only runs for pull request comments
    if: |
         (github.actor == 'EmmaQiaoCh' || github.actor == 'rhdong' || github.actor == 'Ranjeet-Nvidia' ||  github.actor == 'jiashuy') &&
         github.event.comment.body == '/blossom-ci'  
    steps:
      - name: Check if comment is issued by authorized person
        run: blossom-ci
        env:
          OPERATION: 'AUTH'
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
        
  Vulnerability-scan:
    name: Vulnerability scan
    needs: [Authorization]
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
        with:
          repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
          ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
          lfs: 'true'
         
      - name: Run blossom action
        uses: NVIDIA/blossom-action@main
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
        with:
          args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
          args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
          args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
          
  Job-trigger:
    name: Start ci job
    needs: [Vulnerability-scan]
    runs-on: blossom
    steps:
      - name: Start ci job
        run: blossom-ci
        env:
          OPERATION: 'START-CI-JOB'
          CI_SERVER: ${{ secrets.CI_SERVER }}
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
              
  Upload-Log:
    name: Upload log
    runs-on: blossom
    if : github.event_name == 'workflow_dispatch'
    steps:
      - name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)
        run: blossom-ci
        env:
          OPERATION: 'POST-PROCESSING'
          CI_SERVER: ${{ secrets.CI_SERVER }}
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .github/workflows/docs-build.yaml
================================================
name: docs-build

on:
  pull_request:
    branches: [master]

jobs:
  build:
    runs-on: "ubuntu-latest"

    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.8
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: Install Ubuntu packages
        run: |
          sudo apt-get update -y
          sudo apt-get install -y --no-install-recommends doxygen
      - name: Install dependencies
        run: |
          python -m pip install -r docs/requirements-doc.txt
      - name: Building docs
        run: |
          make -C docs html
      - name: Upload HTML
        uses: actions/upload-artifact@v4
        with:
          name: html-build-artifact
          path: docs/build/html
          if-no-files-found: error
          retention-days: 1
      - name: Store PR information
        run: |
          mkdir ./pr
          echo ${{ github.event.number }}              > ./pr/pr.txt
          echo ${{ github.event.pull_request.merged }} > ./pr/merged.txt
          echo ${{ github.event.action }}              > ./pr/action.txt
      - name: Upload PR information
        uses: actions/upload-artifact@v4
        with:
          name: pr
          path: pr/


================================================
FILE: .github/workflows/docs-preview-pr.yaml
================================================
name: docs-preview-pr

on:
  workflow_run:
    workflows: [docs-build]
    types: [completed]

env:
  WF_ID: ${{ github.event.workflow_run.id }}

jobs:
  preview:
    uses: nvidia-merlin/.github/.github/workflows/docs-preview-pr-common.yaml@main

================================================
FILE: .github/workflows/docs-remove-stale-reviews.yaml
================================================
name: docs-remove-stale-reviews

on:
  schedule:
    # 42 minutes after 0:00 UTC on Sundays
    - cron: "42 0 * * 0"
  workflow_dispatch:

jobs:
  remove:
    uses: nvidia-merlin/.github/.github/workflows/docs-remove-stale-reviews-common.yaml@main


================================================
FILE: .github/workflows/docs-sched-rebuild.yaml
================================================
name: docs-sched-rebuild

on:
  push:
    branches: [master]
    tags:
      - v*
  workflow_dispatch:

jobs:
  build:
    runs-on: "ubuntu-latest"

    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Set up Python 3.8
        uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Install Ubuntu packages
        run: |
          sudo apt-get update -y
          sudo apt-get install -y doxygen
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r docs/requirements-doc.txt
      - name: Report the versions to build
        run: |
          sphinx-multiversion -D 'exhale_args.containmentFolder=${sourcedir}/api' --dump-metadata docs/source docs/build/html | jq "keys"
      - name: Building docs (multiversion)
        run: |
          sphinx-multiversion -D 'exhale_args.containmentFolder=${sourcedir}/api' docs/source docs/build/html
      - name: Delete unnecessary files
        run: |
          find docs/build -name .doctrees -prune -exec rm -rf {} \;
          find docs/build -name .buildinfo -exec rm {} \;
      - name: Upload HTML
        uses: actions/upload-artifact@v4
        with:
          name: html-build-artifact
          path: docs/build/html
          if-no-files-found: error
          retention-days: 1

  # Identify the dir for the HTML.
  store-html:
    needs: [build]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          ref: "gh-pages"
      - name: Initialize Git configuration
        run: |
          git config user.name docs-sched-rebuild
          git config user.email do-not-send-@github.com
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          name: html-build-artifact
      - name: Copy HTML directories
        run: |
          ls -asl
          for i in `ls -d *`
          do
            echo "Git adding ${i}"
            git add "${i}"
          done
      - name: Check or create dot-no-jekyll file
        run: |
          if [ -f ".nojekyll" ]; then
            echo "The dot-no-jekyll file already exists."
            exit 0
          fi
          touch .nojekyll
          git add .nojekyll
      - name: Check or create redirect page
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Skip regeneration when index.html already carries a meta refresh.
          # The attribute searched for here must match the one written below.
          resp=$(grep 'http-equiv="refresh"' index.html 2>/dev/null) || true
          if [ -n "${resp}" ]; then
            echo "The redirect file already exists."
            exit 0
          fi
          # If any of these commands fail, fail the build.
          def_branch=$(gh api "repos/${GITHUB_REPOSITORY}" --jq ".default_branch")
          html_url=$(gh api "repos/${GITHUB_REPOSITORY}/pages" --jq ".html_url")
          # Beware ugly quotation mark avoidance in the following lines.
          echo '<!DOCTYPE html>'                                                                         > index.html
          echo '<html>'                                                                                 >> index.html
          echo '  <head>'                                                                               >> index.html
          echo '    <title>Redirect to documentation</title>'                                           >> index.html
          echo '    <meta charset="utf-8">'                                                             >> index.html
          echo '    <meta http-equiv="refresh" content="3; URL='${html_url}${def_branch}'/index.html">' >> index.html
          echo '    <link rel="canonical" href="'${html_url}${def_branch}'/index.html">'                >> index.html
          echo '    <script language="javascript">'                                                     >> index.html
          echo '      function redirect() {'                                                            >> index.html
          echo '        window.location.assign("'${html_url}${def_branch}'/index.html")'                >> index.html
          echo '      }'                                                                                >> index.html
          echo '    </script>'                                                                          >> index.html
          echo '  </head>'                                                                              >> index.html
          echo '  <body onload="redirect()">'                                                           >> index.html
          echo '    <p>Please follow the link to the <a href="'${html_url}${def_branch}'/index.html">'  >> index.html
          echo      ${def_branch}'</a> branch documentation.</p>'                                       >> index.html
          echo '  </body>'                                                                              >> index.html
          echo '</html>'                                                                                >> index.html
          git add index.html
      - name: Commit changes to the GitHub Pages branch
        run: |
          git status
          if git commit -m 'Pushing changes to GitHub Pages.'; then
            git push -f
          else
           echo "Nothing changed."
          fi


================================================
FILE: .gitignore
================================================
.DS_Store
.idea
.vscode
build
.clwb
cmake-build-debug/
docs/build
docs/source/README.md
docs/source/CONTRIBUTING.md
docs/source/api

================================================
FILE: .gitmodules
================================================
[submodule "tests/googletest"]
	path = tests/googletest
	url = https://github.com/google/googletest.git
	ignore = dirty


================================================
FILE: CMakeLists.txt
================================================
# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.10)
project(merlin-hkvs LANGUAGES CXX CUDA)
find_package(CUDAToolkit)

# TODO(Q3): target_compile_features below still declare cxx_std_14, which is
# inconsistent with the project-level C++17.  Update them to cxx_std_17 (or
# remove the per-target lines entirely) once downstream compatibility is
# confirmed.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules)

option(CLANGFORMAT "Clangformat code files before compiling" OFF)
if(CLANGFORMAT)
  include(ClangFormat)
  file(GLOB_RECURSE clangformat_includes
    ${PROJECT_SOURCE_DIR}/include/*.h
    ${PROJECT_SOURCE_DIR}/include/*.hpp
    ${PROJECT_SOURCE_DIR}/include/*.cuh
  )
  file(GLOB clangformat_tests
    ${PROJECT_SOURCE_DIR}/tests/*.c
    ${PROJECT_SOURCE_DIR}/tests/*.h
    ${PROJECT_SOURCE_DIR}/tests/*.cpp
    ${PROJECT_SOURCE_DIR}/tests/*.hpp
    ${PROJECT_SOURCE_DIR}/tests/*.cu
    ${PROJECT_SOURCE_DIR}/tests/*.cuh
  )
  set(clangformat_files ${clangformat_includes} ${clangformat_tests})
  clangformat_setup("${clangformat_files}")
endif()

# Default to release build.
if (NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE "Release")
    message(STATUS "Setting default CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}")
endif()

# Some neat defaults.
set(CUDA_SEPARABLE_COMPILATION ON)

# Select target CUDA binary architecture.
foreach(cuda_arch ${sm})
  list(APPEND cuda_arch_list ${cuda_arch})
  message(STATUS "Assign GPU architecture (sm=${cuda_arch})")
endforeach()

list(LENGTH cuda_arch_list cuda_arch_list_length)
if(cuda_arch_list_length EQUAL 0)
  list(APPEND cuda_arch_list "80")
  message(STATUS "Assign default GPU architecture sm=80")
endif()

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
  add_compile_definitions(CUDA_ERROR_CHECK)
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
endif()

foreach(cuda_arch ${cuda_arch_list})
  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endforeach()

message(CMAKE_CUDA_FLAGS="${CMAKE_CUDA_FLAGS}")

include_directories(
  ${PROJECT_SOURCE_DIR}/include
  ${PROJECT_SOURCE_DIR}/tests/googletest/googletest/include
)

ADD_SUBDIRECTORY(tests/googletest)

link_directories(
)

file(GLOB_RECURSE merlin_hkvs_src RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} *.cpp *.cu)

# TODO:
# add_library(hierarchical_kv STATIC ${hierarchical_kv_src})
# target_compile_features(hierarchical_kv PUBLIC cxx_std_14)
# target_link_libraries(hierarchical_kv PUBLIC ...)


add_executable(merlin_hashtable_benchmark benchmark/merlin_hashtable_benchmark.cc.cu)
target_compile_features(merlin_hashtable_benchmark PUBLIC cxx_std_14)
set_target_properties(merlin_hashtable_benchmark PROPERTIES  CUDA_ARCHITECTURES OFF)

add_executable(find_with_missed_keys_benchmark benchmark/find_with_missed_keys_benchmark.cc.cu)
target_compile_features(find_with_missed_keys_benchmark PUBLIC cxx_std_14)
set_target_properties(find_with_missed_keys_benchmark PROPERTIES  CUDA_ARCHITECTURES OFF)

add_executable(merlin_hashtable_test tests/merlin_hashtable_test.cc.cu)
target_compile_features(merlin_hashtable_test PUBLIC cxx_std_14)
set_target_properties(merlin_hashtable_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(merlin_hashtable_test gtest_main)

add_executable(find_or_insert_test tests/find_or_insert_test.cc.cu)
target_compile_features(find_or_insert_test PUBLIC cxx_std_14)
set_target_properties(find_or_insert_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(find_or_insert_test gtest_main)

add_executable(merlin_memory_pool_test tests/memory_pool_test.cc.cu)
target_compile_features(merlin_memory_pool_test PUBLIC cxx_std_14)
set_target_properties(merlin_memory_pool_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(merlin_memory_pool_test gtest_main)

# NOTE: CMAKE_BUILD_TYPE is deliberately not overridden here. Setting it in
# the middle of the file would shadow the build type chosen by the user (or
# the "Release" default selected earlier in this file), and it would do so
# after the Debug-only options (CUDA_ERROR_CHECK, -lineinfo) were evaluated.
# Configure with -DCMAKE_BUILD_TYPE=Debug to get a debug build.
add_executable(save_and_load_test tests/save_and_load_test.cc.cu)
target_compile_features(save_and_load_test PUBLIC cxx_std_14)
set_target_properties(save_and_load_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(save_and_load_test gtest_main)

add_executable(insert_and_evict_test tests/insert_and_evict_test.cc.cu)
target_compile_features(insert_and_evict_test PUBLIC cxx_std_14)
set_target_properties(insert_and_evict_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(insert_and_evict_test gtest_main)

add_executable(dynamic_max_capacity_test tests/dynamic_max_capacity_test.cc.cu)
target_compile_features(dynamic_max_capacity_test PUBLIC cxx_std_14)
set_target_properties(dynamic_max_capacity_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(dynamic_max_capacity_test gtest_main)

add_executable(group_lock_test tests/group_lock_test.cc.cu)
target_compile_features(group_lock_test PUBLIC cxx_std_14)
set_target_properties(group_lock_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(group_lock_test gtest_main)

add_executable(find_or_insert_ptr_test tests/find_or_insert_ptr_test.cc.cu)
target_compile_features(find_or_insert_ptr_test PUBLIC cxx_std_14)
set_target_properties(find_or_insert_ptr_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(find_or_insert_ptr_test gtest_main)

add_executable(assign_score_test tests/assign_score_test.cc.cu)
target_compile_features(assign_score_test PUBLIC cxx_std_14)
set_target_properties(assign_score_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(assign_score_test gtest_main)

add_executable(uint32_score_test tests/uint32_score_test.cc.cu)
target_compile_features(uint32_score_test PUBLIC cxx_std_14)
set_target_properties(uint32_score_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(uint32_score_test gtest_main)

# Unit test for accum_or_assign; the source lives in tests/ with the same
# `.cc.cu` suffix as every other test so that it is compiled as CUDA.
add_executable(accum_or_assign_test tests/accum_or_assign_test.cc.cu)
target_compile_features(accum_or_assign_test PUBLIC cxx_std_14)
set_target_properties(accum_or_assign_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(accum_or_assign_test gtest_main)

add_executable(assign_values_test tests/assign_values_test.cc.cu)
target_compile_features(assign_values_test PUBLIC cxx_std_14)
set_target_properties(assign_values_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(assign_values_test gtest_main)

add_executable(find_with_missed_keys_test tests/find_with_missed_keys_test.cc.cu)
target_compile_features(find_with_missed_keys_test PUBLIC cxx_std_14)
set_target_properties(find_with_missed_keys_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(find_with_missed_keys_test gtest_main)

add_executable(reserved_keys_test tests/reserved_keys_test.cc.cu)
target_compile_features(reserved_keys_test PUBLIC cxx_std_14)
set_target_properties(reserved_keys_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(reserved_keys_test gtest_main)

# Unit test for export_batch_if; linked against gtest_main like the other
# targets built from tests/.
add_executable(export_batch_if_test tests/export_batch_if_test.cc.cu)
target_compile_features(export_batch_if_test PUBLIC cxx_std_14)
set_target_properties(export_batch_if_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(export_batch_if_test gtest_main)

add_executable(find_or_insert_ptr_lock_test tests/find_or_insert_ptr_lock_test.cc.cu)
target_compile_features(find_or_insert_ptr_lock_test PUBLIC cxx_std_14)
set_target_properties(find_or_insert_ptr_lock_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(find_or_insert_ptr_lock_test gtest_main)

add_executable(lock_unlock_test tests/lock_unlock_test.cc.cu)
target_compile_features(lock_unlock_test PUBLIC cxx_std_14)
set_target_properties(lock_unlock_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(lock_unlock_test gtest_main)

add_executable(dual_bucket_test tests/dual_bucket_test.cc.cu)
target_compile_features(dual_bucket_test PUBLIC cxx_std_14)
set_target_properties(dual_bucket_test PROPERTIES  CUDA_ARCHITECTURES OFF)
TARGET_LINK_LIBRARIES(dual_bucket_test gtest_main)

add_executable(dual_bucket_benchmark benchmark/dual_bucket_benchmark.cc.cu)
target_compile_features(dual_bucket_benchmark PUBLIC cxx_std_14)
set_target_properties(dual_bucket_benchmark PROPERTIES  CUDA_ARCHITECTURES OFF)


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing

## About HierarchicalKV

HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements.

The key capability of HierarchicalKV is to store key-value (feature-embedding) on high-bandwidth memory (HBM) of GPUs and in host memory.

You can also use the library for generic key-value storage.

## Maintainership

HierarchicalKV is co-maintained by [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users,
and also open for public contributions, bug fixes, and documentation. This project adheres to NVIDIA's Code of Conduct.

## Contributing

We’re grateful for your interest in HierarchicalKV and value your contributions. 
We welcome contributions via pull requests (PRs).

Before sending out a pull request for significant change on the end-user API, we recommend you open an issue and
discuss your proposed change. Some changes may require a design review.
All submissions require review by project reviewers.

### Coding Style

Refer to the [Style Guide](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/STYLE_GUIDE.md).

### Additional Requirements

In addition to the above requirements, contribution also needs to meet the following criteria:
* The change needs to include unit tests and integration tests if any.
* Each PR needs to provide necessary documentation for when and how to use it.

## Community

* HierarchicalKV code (https://github.com/NVIDIA-Merlin/HierarchicalKV)

## Licence
Apache License 2.0


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2022 NVIDIA Corporation

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

================================================
FILE: README.md
================================================
# [NVIDIA HierarchicalKV(Beta)](https://github.com/NVIDIA-Merlin/HierarchicalKV)

[![Version](https://img.shields.io/github/v/release/NVIDIA-Merlin/HierarchicalKV?color=orange&include_prereleases)](https://github.com/NVIDIA-Merlin/HierarchicalKV/releases)
[![GitHub License](https://img.shields.io/github/license/NVIDIA-Merlin/HierarchicalKV)](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/LICENSE)
[![Documentation](https://img.shields.io/badge/documentation-blue.svg)](https://nvidia-merlin.github.io/HierarchicalKV/master/README.html)

## About HierarchicalKV

HierarchicalKV is a part of NVIDIA Merlin and provides hierarchical key-value storage to meet RecSys requirements.

The key capability of HierarchicalKV is to store key-value (feature-embedding) on high-bandwidth memory (HBM) of GPUs and in host memory.

You can also use the library for generic key-value storage.

## Benefits

When building large recommender systems, machine learning (ML) engineers face the following challenges:

- GPUs are needed, but HBM on a single GPU is too small for the large DLRMs that scale to several terabytes.
- Improving communication performance is getting more difficult in larger and larger CPU clusters.
- It is difficult to efficiently control consumption growth of limited HBM with customized strategies.
- Most generic key-value libraries provide low HBM and host memory utilization.

HierarchicalKV alleviates these challenges and helps the machine learning engineers in RecSys with the following benefits:

- Supports training large RecSys models on **HBM and host memory** at the same time.
- Provides better performance by **fully bypassing CPUs** and reducing the communication workload.
- Implements table-size restraint strategies that are based on **LRU or customized strategies**.
  The strategies are implemented by CUDA kernels.
- Operates at a high working-status load factor that is close to 1.0.


## Key ideas

- Buckets are locally ordered
- Store keys and values separately
- Store all the keys in HBM
- Built-in and customizable eviction strategy

HierarchicalKV makes NVIDIA GPUs more suitable for training large and super-large models of ***search, recommendations, and advertising***.
The library simplifies the common challenges of building, evaluating, and serving sophisticated recommender models.

## API Documentation

The main classes and structs are below, but reading the comments in the source code is recommended:

- [`class HashTable`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L151)
- [`class EvictStrategy`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L52)
- [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60)

For regular API doc, please refer to [API Docs](https://nvidia-merlin.github.io/HierarchicalKV/master/api/index.html)

### API Maturity Matrix

`industry-validated` means the API has been well-tested and verified in at least one real-world scenario.

| Name                 | Description                                                                                                              | Function           |
|:---------------------|:-------------------------------------------------------------------------------------------------------------------------|:-------------------|
| __insert_or_assign__ | Insert or assign for the specified keys. <br>Overwrite one key with minimum score when bucket is full.                   | industry-validated |
| __insert_and_evict__ | Insert new keys, and evict keys with minimum score when bucket is full.                                                  | industry-validated |
| __find_or_insert__   | Search for the specified keys, and insert them when missed.                                                              | well-tested        |
| __assign__           | Update for each key and bypass when missed.                                                                              | well-tested        |
| __accum_or_assign__  | Search and update for each key. If found, add value as a delta to the original value. <br>If missed, update it directly. | well-tested        |
| __find_or_insert\*__ | Search for the specified keys and return the pointers of values. Insert them firstly when missing.                       | well-tested        |
| __find__             | Search for the specified keys.                                                                                           | industry-validated |
| __find\*__           | Search and return the pointers of values, thread-unsafe but with high performance.                                       | well-tested        |
| __export_batch__     | Exports a certain number of the key-value-score tuples.                                                                  | industry-validated |
| __export_batch_if__  | Exports a certain number of the key-value-score tuples which match specific conditions.                                  | industry-validated |
| __warmup__           | Move the hot key-values from HMEM to HBM                                                                                 | June 15, 2023      |


### Evict Strategy

The `score` defines the importance of each key: the larger the score, the more important the key and the less likely it is to be evicted. Eviction only happens when a bucket is full.
The `score_type` must be `uint64_t`. For more detail, please refer to [`class EvictStrategy`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L52).

| Name           | Definition of `Score`                                                                                                                                                                                           |
|:---------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| __Lru__        | Device clock in a nanosecond, which could differ slightly from host clock.                                                                                                                                      |
| __Lfu__        | Frequency increment provided by caller via the input parameter of `scores` of `insert-like` APIs as the increment of frequency.                                                                                 |
| __EpochLru__   | The high 32bits is the global epoch provided via the input parameter of `global_epoch`, <br>the low 32bits is equal to `(device_clock >> 20) & 0xffffffff` with granularity close to 1 ms.                      |
| __EpochLfu__   | The high 32bits is the global epoch provided via the input parameter of `global_epoch`, <br>the low 32bits is the frequency, <br>the frequency will keep constant after reaching the max value of `0xffffffff`. |
| __Customized__ | Fully provided by the caller via the input parameter of `scores` of `insert-like` APIs.                                                                                                                         |


* __Note__:
  - The `insert-like` APIs are `insert_or_assign`, `insert_and_evict`, `find_or_insert`, `accum_or_assign`, and `find_or_insert*`.
  - The `global_epoch` should be maintained by the caller and input as the input parameter of `insert-like` APIs.

### Configuration Options

It's recommended to keep the default configuration for the options ending with `*`.

| Name                       | Type   | Default | Description                                           |
|:---------------------------|:-------|:--------|:------------------------------------------------------|
| __init_capacity__          | size_t | 0       | The initial capacity of the hash table.               |
| __max_capacity__           | size_t | 0       | The maximum capacity of the hash table.               |
| __max_hbm_for_vectors__    | size_t | 0       | The maximum HBM for vectors, in bytes.                |
| __dim__                    | size_t | 64      | The dimension of the value vectors.                   |
| __max_bucket_size*__       | size_t | 128     | The length of each bucket.                            |
| __max_load_factor*__       | float  | 0.5f    | The max load factor before rehashing.                 |
| __block_size*__            | int    | 128     | The default block size for CUDA kernels.              |
| __io_block_size*__         | int    | 1024    | The block size for IO CUDA kernels.                   |
| __device_id*__             | int    | -1      | The ID of device. Managed internally when set to `-1` |
| __io_by_cpu*__             | bool   | false   | The flag indicating if the CPU handles IO.            |
| __reserved_key_start_bit__ | int    | 0       | The start bit offset of reserved key in the 64 bit    |

- For more details, refer to [`struct HashTableOptions`](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/include/merlin_hashtable.cuh#L60).

#### Reserved Keys
- By default, the keys `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` are reserved for internal use.
  Change `options.reserved_key_start_bit` if you want to use these keys yourself.
  `reserved_key_start_bit` has a valid range from 0 to 62. The default value is 0, which yields the default reserved keys listed above. When `reserved_key_start_bit` is set to any value other than 0, the least significant bit (bit 0) is always `0` for any reserved key.

- Setting `reserved_key_start_bit = 1`:
  - This setting reserves the two least significant bits 1 and 2 for the reserved keys.
  - In binary, the last four bits range from `1000` to `1110`. Here, the least significant bit (bit 0) is always `0`, and bits from 3 to 63 are set to `1`.
  - The new reserved keys in hexadecimal representation are as follows:
    - `0xFFFFFFFFFFFFFFFE`
    - `0xFFFFFFFFFFFFFFFC`
    - `0xFFFFFFFFFFFFFFF8`
    - `0xFFFFFFFFFFFFFFFA`

- Setting `reserved_key_start_bit = 2`:
  - This configuration reserves bits 2 and 3 as reserved keys.
  - The binary representation for the last five bits ranges from `10010` to `11110`, with the least significant bit (bit 0) always set to `0`, and bits from 4 to 63 are set to `1`.

- If you change `reserved_key_start_bit`, you must use the same value when saving and loading the table.
  For more details, refer to [`init_reserved_keys`](https://github.com/search?q=repo%3ANVIDIA-Merlin%2FHierarchicalKV%20init_reserved_keys&type=code).

### How to use:
```cpp
#include "merlin_hashtable.cuh"


using TableOptions = nv::merlin::HashTableOptions;
using EvictStrategy = nv::merlin::EvictStrategy;

int main(int argc, char *argv[])
{
  using K = uint64_t;
  using V = float;
  using S = uint64_t;
  
  // 1. Define the table and use LRU eviction strategy.
  using HKVTable = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
  std::unique_ptr<HKVTable> table = std::make_unique<HKVTable>();
  
  // 2. Define the configuration options.
  TableOptions options;
  options.init_capacity = 16 * 1024 * 1024;
  options.max_capacity = options.init_capacity;
  options.dim = 16;
  options.max_hbm_for_vectors = nv::merlin::GB(16);
  
  
  // 3. Initialize the table memory resource.
  table->init(options);
  
  // 4. Use table to do something.
  
  return 0;
}

```

### Usage restrictions

- The `key_type` must be `int64_t` or `uint64_t`.
- The `score_type` must be `uint64_t`.
## Contributors

HierarchicalKV is co-maintained by the [NVIDIA Merlin Team](https://github.com/NVIDIA-Merlin) and NVIDIA product end-users,
and also open for public contributions, bug fixes, and documentation. [[Contribute](CONTRIBUTING.md)]

## How to build

HierarchicalKV is a header-only library; the commands below only create binaries for benchmarking and unit testing.

Your environment must meet the following requirements:

- CUDA version >= 11.2
- NVIDIA GPU with compute capability 8.0, 8.6, 8.7 or 9.0
- A GCC version that supports the `C++17` standard or later.
- Bazel version >= 3.7.2 (Bazel compile only)

### with cmake
```shell
git clone --recursive https://github.com/NVIDIA-Merlin/HierarchicalKV.git
cd HierarchicalKV && mkdir -p build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -Dsm=80 .. && make -j
```

For Debug:
```shell
cmake -DCMAKE_BUILD_TYPE=Debug -Dsm=80 .. && make -j
```

For Benchmark:
```shell
./merlin_hashtable_benchmark
```

For Unit Test:
```shell
./merlin_hashtable_test
```

### with bazel

- DON'T use the option of `--recursive` for `git clone`.
- Please modify the environment variables in the `.bazelrc` file in advance if using the customized docker images.
- The docker images maintained on `nvcr.io/nvidia/tensorflow` are highly recommended.

Pull the docker image:
```shell
docker pull nvcr.io/nvidia/tensorflow:22.09-tf2-py3
docker run --gpus all -it --rm nvcr.io/nvidia/tensorflow:22.09-tf2-py3
```

Compile in docker container:
```shell
git clone https://github.com/NVIDIA-Merlin/HierarchicalKV.git
cd HierarchicalKV && bash bazel_build.sh
```

For Benchmark:
```shell
./benchmark_util
```


## Benchmark & Performance(W.I.P)

* GPU: 1 x NVIDIA A100 80GB PCIe: 8.0
* Key Type = uint64_t
* Value Type = float32 * {dim}
* Key-Values per OP = 1048576
* Evict strategy: LRU
* `λ`: load factor
* `find*` means the `find` API that directly returns the addresses of values.
* `find_or_insert*` means the `find_or_insert` API that directly returns the addresses of values.
* ***Throughput Unit: Billion-KV/second***

### On pure HBM mode: 

* dim = 8, capacity = 128 Million-KV, HBM = 4 GB, HMEM = 0 GB

|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 |            1.093 |  2.470 |          1.478 |  1.770 |  3.726 |           1.447 |            1.075 |
| 0.75 |            1.045 |  2.452 |          1.335 |  1.807 |  3.374 |           1.309 |            1.013 |
| 1.00 |            0.655 |  2.481 |          0.612 |  1.815 |  1.865 |           0.619 |            0.511 |

|    λ | export_batch | export_batch_if | contains |
|-----:|-------------:|----------------:|---------:|
| 0.50 |        2.087 |          12.258 |    3.121 |
| 0.75 |        2.045 |          12.447 |    3.094 |
| 1.00 |        1.950 |           2.657 |    3.096 |

* dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB

|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 |            0.961 |  2.272 |          1.278 |  1.706 |  3.718 |           1.435 |            0.931 |
| 0.75 |            0.930 |  2.238 |          1.177 |  1.693 |  3.369 |           1.316 |            0.866 |
| 1.00 |            0.646 |  2.321 |          0.572 |  1.783 |  1.873 |           0.618 |            0.469 |

|    λ | export_batch | export_batch_if | contains |
|-----:|-------------:|----------------:|---------:|
| 0.50 |        0.692 |          10.784 |    3.100 |
| 0.75 |        0.569 |          10.240 |    3.075 |
| 1.00 |        0.551 |           0.765 |    3.096 |

* dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB

|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 |            0.834 |  1.982 |          1.113 |  1.499 |  3.950 |           1.502 |            0.805 |
| 0.75 |            0.801 |  1.951 |          1.033 |  1.493 |  3.545 |           1.359 |            0.773 |
| 1.00 |            0.621 |  2.021 |          0.608 |  1.541 |  1.965 |           0.613 |            0.481 |

|    λ | export_batch | export_batch_if | contains |
|-----:|-------------:|----------------:|---------:|
| 0.50 |        0.316 |           8.199 |    3.239 |
| 0.75 |        0.296 |           8.549 |    3.198 |
| 1.00 |        0.288 |           0.395 |    3.225 |

### On HBM+HMEM hybrid mode: 

* dim = 64, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 16 GB

|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
| 0.50 |            0.083 |  0.124 |          0.109 |  0.131 |  3.705 |           1.435 |
| 0.75 |            0.083 |  0.122 |          0.111 |  0.129 |  3.221 |           1.274 |
| 1.00 |            0.073 |  0.123 |          0.095 |  0.126 |  1.854 |           0.617 |

|    λ | export_batch | export_batch_if | contains |
|-----:|-------------:|----------------:|---------:|
| 0.50 |        0.318 |           8.086 |    3.122 |
| 0.75 |        0.294 |           5.549 |    3.111 |
| 1.00 |        0.287 |           0.393 |    3.075 |

* dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB

|    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
| 0.50 |            0.049 |  0.069 |          0.049 |  0.069 |  3.484 |           1.370 |
| 0.75 |            0.049 |  0.069 |          0.049 |  0.069 |  3.116 |           1.242 |
| 1.00 |            0.047 |  0.072 |          0.047 |  0.070 |  1.771 |           0.607 |

|    λ | export_batch | export_batch_if | contains |
|-----:|-------------:|----------------:|---------:|
| 0.50 |        0.316 |           8.181 |    3.073 |
| 0.75 |        0.293 |           8.950 |    3.052 |
| 1.00 |        0.292 |           0.394 |    3.026 |

### Support and Feedback:

If you encounter any issues or have questions, go to [https://github.com/NVIDIA-Merlin/HierarchicalKV/issues](https://github.com/NVIDIA-Merlin/HierarchicalKV/issues) and submit an issue so that we can provide you with the necessary resolutions and answers.

### Acknowledgment
We are very grateful to external initial contributors [@Zhangyafei](https://github.com/zhangyafeikimi) and [@Lifan](https://github.com/Lifann) for their design, coding, and review work.

### License
Apache License 2.0


================================================
FILE: STYLE_GUIDE.md
================================================
#### C++
C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).

HierarchicalKV uses [clang-format](https://clang.llvm.org/docs/ClangFormat.html)
to check your C/C++ changes. Sometimes you have some manually formatted
code that you don’t want clang-format to touch.
You can disable formatting like this:

```cpp
int formatted_code;
// clang-format off
    void    unformatted_code  ;
// clang-format on
void formatted_code_again;
```

Install Clang-format (the version 18.1.3 is required) for Ubuntu:

```bash
sudo apt install clang-format-18
```

format all with:
```bash
find ./ \( -path ./tests/googletest -prune \) -o \( -iname *.h -o -iname *.cpp -o -iname *.cc -o -iname *.cu -o -iname *.cuh -o -iname *.hpp \) -print | xargs clang-format-18 -i --style=file

```


================================================
FILE: WORKSPACE
================================================
workspace(name = "HierarchicalKV")

load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("//build_deps/gpus:configure.bzl", "cuda_configure")

# Fetch the bazel_skylib 0.9.0 release tarball; the sha256 pins the exact
# archive contents that are accepted.
http_archive(
    name = "bazel_skylib",
    sha256 = "1dde365491125a3db70731e25658dfdd3bc5dbdfd11b840b3e987ecf043c7ca0",
    urls = [
        "https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz",
    ],
)

# Instantiate the CUDA configuration rule loaded from //build_deps/gpus under
# the repository name `local_config_cuda`.
cuda_configure(name = "local_config_cuda")


================================================
FILE: bazel_build.sh
================================================
#!/bin/bash

# Usage : `./bazel_build.sh` or `bash bazel_build.sh`
#
# Builds every Bazel target (`//...`) with the `cuda` config.
# `.bazeliskrc` is read through a relative path, so run this script from the
# directory that contains that file.

# Abort on the first command that fails.
set -e
# Export every whitespace-separated entry of .bazeliskrc into the environment
# (presumably KEY=VALUE settings for bazelisk -- confirm against that file).
export $(cat .bazeliskrc | xargs)

bazel build --config=cuda //...


================================================
FILE: benchmark/BUILD
================================================
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_cc_library")

# Benchmark executable. It lists no sources of its own; everything comes from
# the `benchmark_lib` target it depends on.
cc_binary(
    name = "benchmark_util",
    deps = [
        ":benchmark_lib",
    ],
)

# CUDA library holding the benchmark source and its helper header.
cuda_cc_library(
    name = "benchmark_lib",
    srcs = [
        "merlin_hashtable_benchmark.cc.cu",
    ],
    hdrs = [
        "benchmark_util.cuh",
    ],
    # Adds include/ to the header search path.
    copts = ["-Iinclude/"],
    linkopts = ["-pthread"],
    deps = [
        "//include:merlin_hashtable",
        "@local_config_cuda//cuda",
    ],
)


================================================
FILE: benchmark/benchmark_util.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <chrono>
#include <cmath>
#include <cstdint>
#include "merlin/utils.cuh"

namespace benchmark {

// Unit in which the timers in this file report elapsed time.
// Each enumerator's numeric value is a base-10 exponent (1 s = 10^3 ms =
// 10^6 us = 10^9 ns). The timers subtract the exponent of their native unit
// from the requested one to obtain a power-of-ten conversion factor, so these
// values must not be changed independently of that arithmetic.
enum class TimeUnit {
  Second = 0,
  MilliSecond = 3,
  MicroSecond = 6,
  NanoSecond = 9,
};

// Identifies one hash-table API by name.
// NOTE(review): presumably used by the benchmark driver to choose which API
// to measure, with the `*_ptr` entries standing for the pointer-returning
// variants -- confirm against the callers, which are not in this header.
enum class API_Select {
  find = 0,
  insert_or_assign = 1,
  find_or_insert = 2,
  assign = 3,
  insert_and_evict = 4,
  find_ptr = 5,
  find_or_insert_ptr = 6,
  export_batch = 7,
  export_batch_if = 8,
  contains = 9,
};

// Selects how create_keys_for_hitrate() produces the leading "hit" portion of
// a key batch.
enum class Hit_Mode {
  // Overwrite the hit slots with distinct keys drawn uniformly at random from
  // a range starting at 0.
  random = 0,
  // Leave the keys already stored in the hit slots untouched and only refresh
  // their scores.
  last_insert = 1,
};

/**
 * Host-side stopwatch built on std::chrono::steady_clock.
 *
 * Call start(), then end(), then getResult() to obtain the elapsed time
 * between the two recorded points, expressed in the TimeUnit chosen at
 * construction.
 *
 * @tparam Rep Arithmetic type of the returned value (for example `double` or
 *             `int64_t`).
 */
template <typename Rep>
struct Timer {
  /// @param tu Unit in which getResult() reports the elapsed time.
  explicit Timer(TimeUnit tu = TimeUnit::Second) : tu_(tu) {}

  /// Records the starting time point.
  void start() { startRecord = std::chrono::steady_clock::now(); }

  /// Records the ending time point.
  void end() { endRecord = std::chrono::steady_clock::now(); }

  /**
   * Returns `end - start` converted to the configured unit.
   *
   * The interval is measured in nanoseconds and scaled by
   * 10^(unit - NanoSecond). The scaling is carried out in `double` and cast
   * to `Rep` only once at the end: casting the factor itself to `Rep` would
   * truncate fractional factors such as 1e-9 to zero for integral `Rep` and
   * make every result 0.
   */
  Rep getResult() {
    const auto elapsed_ns =
        std::chrono::duration_cast<std::chrono::nanoseconds>(endRecord -
                                                             startRecord)
            .count();
    const int exponent =
        static_cast<int>(tu_) - static_cast<int>(TimeUnit::NanoSecond);
    const double scale = std::pow(10.0, exponent);
    return static_cast<Rep>(static_cast<double>(elapsed_ns) * scale);
  }

 private:
  TimeUnit tu_;
  std::chrono::time_point<std::chrono::steady_clock> startRecord{};
  std::chrono::time_point<std::chrono::steady_clock> endRecord{};
};

/**
 * RAII stopwatch based on a pair of CUDA events.
 *
 * The constructor creates the two events and the destructor destroys them.
 * start() and end() record the events without a stream argument; end() also
 * blocks until the end event has completed and stores the elapsed time.
 *
 * NOTE(review): the type is implicitly copyable, and a copy would destroy the
 * same events a second time -- avoid copying instances.
 *
 * @tparam Rep Arithmetic type of the returned value (for example `double` or
 *             `int64_t`).
 */
template <typename Rep>
struct KernelTimer {
  /// @param tu Unit in which getResult() reports the elapsed time.
  explicit KernelTimer(TimeUnit tu = TimeUnit::Second) : tu_(tu) {
    CUDA_CHECK(cudaEventCreate(&start_));
    CUDA_CHECK(cudaEventCreate(&end_));
  }
  ~KernelTimer() {
    CUDA_CHECK(cudaEventDestroy(start_));
    CUDA_CHECK(cudaEventDestroy(end_));
  }

  /// Records the start event.
  void start() { CUDA_CHECK(cudaEventRecord(start_)); }

  /// Records the end event, waits for it and stores the elapsed time.
  void end() {
    CUDA_CHECK(cudaEventRecord(end_));
    CUDA_CHECK(cudaEventSynchronize(end_));
    CUDA_CHECK(cudaEventElapsedTime(&time, start_, end_));
  }

  /**
   * Returns the time stored by the last end() converted to the configured
   * unit. Until end() has been called the stored time is the initial -1.0f.
   *
   * cudaEventElapsedTime reports milliseconds, so the value is scaled by
   * 10^(unit - MilliSecond). The scaling is carried out in `double` and cast
   * to `Rep` only once: casting the factor itself to `Rep` would truncate
   * fractional factors such as 1e-3 to zero for integral `Rep`.
   */
  Rep getResult() {
    const int exponent =
        static_cast<int>(tu_) - static_cast<int>(TimeUnit::MilliSecond);
    const double scale = std::pow(10.0, exponent);
    return static_cast<Rep>(static_cast<double>(time) * scale);
  }

 private:
  TimeUnit tu_;
  float time{-1.0f};
  cudaEvent_t start_;
  cudaEvent_t end_;
};

/// Returns the current std::chrono::system_clock time as whole milliseconds
/// since that clock's epoch.
inline uint64_t getTimestamp() {
  using std::chrono::milliseconds;
  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  return std::chrono::duration_cast<milliseconds>(since_epoch).count();
}

/**
 * Fills `h_keys` with the consecutive keys start, start + 1, ... and, when
 * `h_scores` is not null, sets each score to `key % freq_range`.
 *
 * @param h_keys         Output array with room for `key_num_per_op` keys.
 * @param h_scores       Optional output array of the same length; may be
 *                       nullptr to skip score generation.
 * @param key_num_per_op Number of keys to generate. Zero or a negative value
 *                       writes nothing.
 * @param start          Value of the first key.
 * @param freq_range     Modulus applied to each key to derive its score; must
 *                       be non-zero when `h_scores` is provided.
 */
template <class K, class S>
void create_continuous_keys(K* h_keys, S* h_scores, const int key_num_per_op,
                            const K start = 0, int freq_range = 1000) {
  // The index is a plain int so that it is compared with `key_num_per_op` in
  // its own type. Using K as the index type converts a negative count to a
  // huge bound whenever K is unsigned and overruns both buffers.
  for (int i = 0; i < key_num_per_op; ++i) {
    const K key = start + static_cast<K>(i);
    h_keys[i] = key;
    if (h_scores != nullptr) h_scores[i] = key % freq_range;
  }
}

/**
 * Fills `h_keys` with `key_num_per_op` distinct random keys and, when
 * `h_scores` is not null, stamps every score with getTimestamp().
 *
 * Keys come from a std::mt19937_64 engine seeded by std::random_device and a
 * default-constructed std::uniform_int_distribution<K>. Uniqueness is enforced
 * by collecting the draws in a std::unordered_set, whose iteration order
 * decides the order in which the keys are written out.
 */
template <class K, class S>
void create_random_keys(K* h_keys, S* h_scores, const int key_num_per_op) {
  std::random_device seed_source;
  std::mt19937_64 generator(seed_source());
  std::uniform_int_distribution<K> any_key;
  std::unordered_set<K> unique_keys;

  // Keep drawing until enough distinct values have been collected.
  while (unique_keys.size() < key_num_per_op) {
    unique_keys.insert(any_key(generator));
  }

  int slot = 0;
  for (auto it = unique_keys.begin(); it != unique_keys.end(); ++it, ++slot) {
    h_keys[slot] = *it;
    if (h_scores != nullptr) h_scores[slot] = getTimestamp();
  }
}

/**
 * Prepares a key batch in which the first `key_num_per_op * hitrate` slots are
 * the "hit" portion and the remaining slots receive freshly generated keys.
 *
 * @param h_keys         In/out key array of length `key_num_per_op`.
 * @param h_scores       Optional score array of the same length; nullptr
 *                       skips all score writes.
 * @param key_num_per_op Number of keys in the batch.
 * @param hitrate        Fraction of the batch treated as the hit portion.
 * @param hit_mode       Hit_Mode::random overwrites the hit slots with random
 *                       keys; any other mode keeps the keys already stored
 *                       there.
 * @param end            Random mode only: hit keys are drawn from
 *                       [0, end - 1] (from [0, 1] when `end` is 0).
 * @param reset          Restart the descending counter that supplies the
 *                       non-hit keys.
 * @param freq_range     Modulus used when deriving scores.
 *
 * The non-hit keys come from a function-local static counter, so state
 * persists between calls (separately for each template instantiation) until
 * `reset` is passed as true.
 */
template <typename K, typename S>
void create_keys_for_hitrate(K* h_keys, S* h_scores, const int key_num_per_op,
                             const float hitrate = 0.6f,
                             const Hit_Mode hit_mode = Hit_Mode::last_insert,
                             const K end = 0, const bool reset = false,
                             int freq_range = 1000) {
  // Slots [0, divide) form the hit portion; the product is truncated to int.
  int divide = static_cast<int>(key_num_per_op * hitrate);
  if (Hit_Mode::random == hit_mode) {
    std::random_device rd;
    std::mt19937_64 eng(rd());
    // Largest key that may be drawn.
    K existed_max = end == 0 ? 1 : (end - 1);
    std::uniform_int_distribution<K> distr(0, existed_max);

    // Without this check the de-duplicating loop below could never collect
    // `divide` distinct values; report the problem and terminate the process.
    if (existed_max < divide) {
      std::cout << "# Can not generate enough keys for hit!";
      exit(-1);
    }
    // Draw until `divide` distinct keys have been collected.
    std::unordered_set<K> numbers;
    while (numbers.size() < divide) {
      numbers.insert(distr(eng));
    }
    // Write the keys in the set's iteration order; in this mode the score is
    // derived from the key itself.
    int i = 0;
    for (auto existed_value : numbers) {
      h_keys[i] = existed_value;
      if (h_scores != nullptr) h_scores[i] = h_keys[i] % freq_range;
      i++;
    }
  } else {
    // Keep the keys already stored in the hit slots and only refresh their
    // scores from the current timestamp.
    for (int i = 0; i < divide; i++) {
      if (h_scores != nullptr) h_scores[i] = getTimestamp() % freq_range;
    }
  }

  // Counter for the non-hit keys: starts at the largest value of K and counts
  // down, continuing where the previous call stopped unless `reset` is set.
  // NOTE(review): presumably chosen so these keys do not collide with keys
  // that were inserted earlier -- confirm with the callers.
  static K new_value = std::numeric_limits<K>::max();
  if (reset) {
    new_value = std::numeric_limits<K>::max();
  }
  // Fill the remaining slots [divide, key_num_per_op) with descending keys.
  for (int i = divide; i < key_num_per_op; i++) {
    h_keys[i] = new_value--;
    if (h_scores != nullptr) h_scores[i] = getTimestamp() % freq_range;
  }
}

// Overwrites the first `key_num_per_op` entries of `h_scores` with the value
// returned by `getTimestamp()`, calling it once per entry.
template <typename S>
void refresh_scores(S* h_scores, const int key_num_per_op) {
  int idx = 0;
  while (idx < key_num_per_op) {
    h_scores[idx] = getTimestamp();
    ++idx;
  }
}

// Fills a row-major `key_num_per_op x dim` buffer so that every element of
// row `i` equals `static_cast<V>(h_keys[i] * 0.00001)`.
template <class K, class V>
void init_value_using_key(K* h_keys, V* h_vectors, const int key_num_per_op,
                          int dim) {
  // Rows are contiguous, so a single running output pointer visits the same
  // elements as indexing with `row * dim + col`.
  V* out = h_vectors;
  for (size_t row = 0; row < key_num_per_op; ++row) {
    for (size_t col = 0; col < dim; ++col) {
      *out = static_cast<V>(h_keys[row] * 0.00001);
      ++out;
    }
  }
}

// Gathers vectors referenced by a pointer table into a dense buffer.
//
// `src` holds one pointer per vector; `dst` is a dense row-major buffer with
// `dim` elements per vector; `N` is the total number of elements to copy
// (number of vectors times `dim`). Element `t` is read from
// `src[t / dim][t % dim]` and written to `dst[t]`. Vectors whose pointer is
// null are skipped, leaving the corresponding part of `dst` unchanged.
//
// The loop strides by the total number of launched threads, so any 1D launch
// configuration covers all `N` elements.
template <class V>
__global__ void read_from_ptr_kernel(const V* const* __restrict src,
                                     V* __restrict dst, const size_t dim,
                                     size_t N) {
  // Widen before multiplying: blockIdx/blockDim/gridDim are 32-bit, so the
  // product would otherwise wrap for large launches.
  const size_t tid =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = tid; t < N; t += stride) {
    // Keep indices in size_t; narrowing to int truncates when N exceeds
    // INT_MAX.
    const size_t vec_index = t / dim;
    const size_t dim_index = t % dim;
    const V* vec = src[vec_index];
    if (vec) {
      // vec_index * dim + dim_index == t
      dst[t] = vec[dim_index];
    }
  }
}

// Launches `read_from_ptr_kernel` on `stream` to copy `n` vectors of `dim`
// elements each from the pointer table `src` into the dense buffer `dst`.
// The launch is asynchronous; no synchronization is performed here.
template <class V>
void read_from_ptr(const V* const* __restrict src, V* __restrict dst,
                   const size_t dim, size_t n, cudaStream_t stream) {
  const size_t threads_per_block = 1024;
  // The kernel works at element granularity.
  const size_t element_count = n * dim;
  const size_t block_count =
      nv::merlin::SAFE_GET_GRID_SIZE(element_count, threads_per_block);

  read_from_ptr_kernel<V><<<block_count, threads_per_block, 0, stream>>>(
      src, dst, dim, element_count);
}

// Builds a pointer table over a dense row-major buffer.
//
// For every `t` in [0, N), writes `ptr[t] = array + t * dim`, i.e. the address
// of the t-th `dim`-element vector inside `array`.
//
// The loop strides by the total number of launched threads, so any 1D launch
// configuration covers all `N` entries.
template <class V>
__global__ void array2ptr_kernel(V** ptr, V* __restrict array, const size_t dim,
                                 size_t N) {
  // Widen before multiplying: blockIdx/blockDim/gridDim are 32-bit, so the
  // product would otherwise wrap for large launches.
  const size_t tid =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = tid; t < N; t += stride) {
    // Index with size_t directly; narrowing `t` to int truncates when N
    // exceeds INT_MAX.
    ptr[t] = array + t * dim;
  }
}

// Launches `array2ptr_kernel` on `stream` so that `ptr[i]` points at the i-th
// `dim`-element vector of `array` for every i in [0, n).
// The launch is asynchronous; no synchronization is performed here.
template <class V>
void array2ptr(V** ptr, V* __restrict array, const size_t dim, size_t n,
               cudaStream_t stream) {
  const size_t threads_per_block = 1024;
  // One pointer is produced per vector, so the work size is `n` itself.
  const size_t entry_count = n;
  const size_t block_count =
      nv::merlin::SAFE_GET_GRID_SIZE(entry_count, threads_per_block);

  array2ptr_kernel<V><<<block_count, threads_per_block, 0, stream>>>(
      ptr, array, dim, entry_count);
}

// Reads the PTX `%globaltimer` special register as a 64-bit value and stores
// it to `*d_clk`. Every launched thread performs the same store; the caller
// in this file launches a single thread.
template <class S>
__global__ void host_nano_kernel(S* d_clk) {
  S timer_value;
  asm volatile("mov.u64 %0,%%globaltimer;" : "=l"(timer_value));
  d_clk[0] = timer_value;
}

// Returns the value of the device `%globaltimer` register as sampled by
// `host_nano_kernel` on `stream`.
//
// A one-element device buffer is allocated for the sample, the stream is
// synchronized after the launch, and the buffer is freed before returning.
template <class S>
S host_nano(cudaStream_t stream = 0) {
  S* d_clk;
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_clk), sizeof(S)));

  host_nano_kernel<S><<<1, 1, 0, stream>>>(d_clk);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  S h_clk = 0;
  CUDA_CHECK(cudaMemcpy(&h_clk, d_clk, sizeof(S), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaFree(d_clk));
  return h_clk;
}

// Predicate for `export_batch_if`: selects an entry when its score is
// strictly greater than `threshold`. `key` and `pattern` are accepted to
// satisfy the predicate signature but are not inspected.
template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    const bool above_threshold = score > threshold;
    return above_threshold;
  }
};

}  // namespace benchmark


================================================
FILE: benchmark/dual_bucket_benchmark.cc.cu
================================================
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <chrono>
#include <cstdio>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>
#include "merlin_hashtable.cuh"

using K = uint64_t;
using V = float;
using S = uint64_t;
using TableOptions = nv::merlin::HashTableOptions;
using TableMode = nv::merlin::TableMode;
using EvictStrategy = nv::merlin::EvictStrategy;

// Times one `insert_or_assign` call of `n` keys on `stream` and returns the
// throughput in millions of operations per second.
//
// The stream is synchronized before the clock starts and again before it
// stops, so the measurement covers the whole call. Elapsed time is truncated
// to whole microseconds.
template <typename Table>
double benchmark_insert(Table& table, size_t n, K* d_keys, V* d_values,
                        S* d_scores, cudaStream_t stream) {
  using Clock = std::chrono::high_resolution_clock;

  CUDA_CHECK(cudaStreamSynchronize(stream));
  const auto t_begin = Clock::now();
  table.insert_or_assign(n, d_keys, d_values, d_scores, stream, true);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  const auto t_end = Clock::now();

  const auto elapsed_us =
      std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_begin)
          .count();
  const double elapsed_ms = elapsed_us / 1000.0;
  // ops per millisecond / 1000 == Mops/s
  return static_cast<double>(n) / elapsed_ms / 1000.0;
}

// Times one `find` call of `n` keys on `stream` (scores are not requested)
// and returns the throughput in millions of operations per second.
//
// The stream is synchronized before the clock starts and again before it
// stops, so the measurement covers the whole call. Elapsed time is truncated
// to whole microseconds.
template <typename Table>
double benchmark_find(Table& table, size_t n, K* d_keys, V* d_values,
                      bool* d_founds, cudaStream_t stream) {
  using Clock = std::chrono::high_resolution_clock;

  CUDA_CHECK(cudaStreamSynchronize(stream));
  const auto t_begin = Clock::now();
  table.find(n, d_keys, d_values, d_founds, nullptr, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  const auto t_end = Clock::now();

  const auto elapsed_us =
      std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_begin)
          .count();
  const double elapsed_ms = elapsed_us / 1000.0;
  // ops per millisecond / 1000 == Mops/s
  return static_cast<double>(n) / elapsed_ms / 1000.0;
}

// Runs the insert/find benchmark for one table mode and prints a result table.
//
// A table with fixed capacity `capacity`, vector dimension `dim`, bucket size
// 128 and the given `mode` is filled step by step to each load factor in
// {0.25, 0.50, 0.75, 0.90, 0.95, 1.00}. After each step the throughput of the
// incremental insert and of a find over all keys inserted so far is printed,
// followed by the final table size once all steps are done.
void run_benchmark(size_t capacity, size_t dim, TableMode mode,
                   const char* mode_name) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  Table table;
  TableOptions options;
  options.init_capacity = capacity;
  options.max_capacity = capacity;
  options.max_hbm_for_vectors = 0;
  options.dim = dim;
  options.max_bucket_size = 128;
  options.table_mode = mode;
  table.init(options);

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Host-side data: keys 1..max_n, scores 1..max_n, all values 1.0f.
  const size_t max_n = capacity;
  const size_t key_bytes = max_n * sizeof(K);
  const size_t value_bytes = max_n * dim * sizeof(V);
  const size_t score_bytes = max_n * sizeof(S);

  std::vector<K> h_keys(max_n);
  std::vector<V> h_values(max_n * dim, 1.0f);
  std::vector<S> h_scores(max_n);
  std::iota(h_keys.begin(), h_keys.end(), 1);
  std::iota(h_scores.begin(), h_scores.end(), S{1});

  // Device buffers.
  K* d_keys;
  V* d_values;
  S* d_scores;
  bool* d_founds;
  V* d_found_values;
  CUDA_CHECK(cudaMalloc(&d_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&d_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&d_founds, max_n * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_found_values, value_bytes));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys.data(), key_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  printf("--- %s (capacity=%zuK, dim=%zu) ---\n", mode_name, capacity / 1024,
         dim);
  printf("  %-12s  %-18s  %-18s\n", "Load Factor", "Insert (Mops/s)",
         "Find (Mops/s)");

  const float load_factors[] = {0.25f, 0.50f, 0.75f, 0.90f, 0.95f, 1.00f};
  size_t inserted_so_far = 0;

  for (const float lf : load_factors) {
    const size_t target_n = static_cast<size_t>(capacity * lf);
    if (target_n > max_n) break;
    // Nothing new to insert for this step.
    if (target_n == inserted_so_far) continue;
    const size_t batch_n = target_n - inserted_so_far;

    // Insert only the keys needed to reach the target load factor.
    K* batch_keys = d_keys + inserted_so_far;
    V* batch_values = d_values + inserted_so_far * dim;
    S* batch_scores = d_scores + inserted_so_far;
    const double insert_mops = benchmark_insert(table, batch_n, batch_keys,
                                                batch_values, batch_scores,
                                                stream);

    // Look up every key inserted so far.
    const double find_mops = benchmark_find(table, target_n, d_keys,
                                            d_found_values, d_founds, stream);

    printf("  %-12.2f  %-18.1f  %-18.1f\n", lf, insert_mops, find_mops);
    inserted_so_far = target_n;
  }

  // Report how many entries the table holds after the last step.
  const size_t table_size = table.size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("  Final size: %zu / %zu (LF=%.4f)\n", table_size, capacity,
         static_cast<float>(table_size) / capacity);

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_values));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_founds));
  CUDA_CHECK(cudaFree(d_found_values));
  CUDA_CHECK(cudaStreamDestroy(stream));
}

// Entry point of the dual-bucket benchmark.
//
// Usage: <binary> [capacity] [dim]
//   capacity  number of table slots (default 128 * 1024 * 8 = 1,048,576)
//   dim       vector dimension      (default 64)
//
// Runs the benchmark once per table mode. Returns 0 on success, 1 when an
// argument is not a positive integer.
int main(int argc, char** argv) {
  printf("=== Dual-Bucket Benchmark Results ===\n\n");

  // Default: 1M capacity, dim=64.
  size_t capacity = 128 * 1024 * 8;  // ~1M
  size_t dim = 64;

  // atol() yields 0 for non-numeric input and may return a negative value;
  // casting either to size_t would produce an empty or absurdly large table,
  // so such arguments are rejected up front.
  if (argc > 1) {
    const long parsed_capacity = atol(argv[1]);
    if (parsed_capacity <= 0) {
      fprintf(stderr, "Invalid capacity '%s': expected a positive integer\n",
              argv[1]);
      return 1;
    }
    capacity = static_cast<size_t>(parsed_capacity);
  }
  if (argc > 2) {
    const long parsed_dim = atol(argv[2]);
    if (parsed_dim <= 0) {
      fprintf(stderr, "Invalid dim '%s': expected a positive integer\n",
              argv[2]);
      return 1;
    }
    dim = static_cast<size_t>(parsed_dim);
  }

  run_benchmark(capacity, dim, TableMode::kThroughput, "THROUGHPUT_MODE");
  printf("\n");
  run_benchmark(capacity, dim, TableMode::kMemory, "MEMORY_MODE");
  printf("\n");

  printf("=== Benchmark Complete ===\n");
  return 0;
}


================================================
FILE: benchmark/find_with_missed_keys_benchmark.cc.cu
================================================
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <limits>
#include <random>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include "benchmark_util.cuh"
#include "merlin_hashtable.cuh"

using K = uint64_t;
using V = float;
using S = uint64_t;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;
using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

// Prints the two header rows of the result table to std::cout: a blank line,
// the column titles, and a dashed separator. Each separator cell is as wide
// as its title cell.
void print_tile() {
  static const char* const kTitleCells[] = {
      "|    \u03BB ",                      // load factor
      "| capacity ",
      "| max_hbm_for_vectors ",
      "| max_bucket_size ",
      "| dim ",
      "| missed_ratio ",
      "| througput(BillionKV/secs) ",
  };
  static const char* const kRuleCells[] = {
      "|------",                       // load factor
      "|----------",                   // capacity
      "|---------------------",        // max_hbm_for_vectors
      "|-----------------",            // max_bucket_size
      "|-----",                        // dim
      "|--------------",               // missed_ratio
      "|---------------------------",  // througput(BillionKV/secs)
  };

  std::cout << std::endl;
  for (const char* cell : kTitleCells) {
    std::cout << cell;
  }
  std::cout << "|\n";

  for (const char* cell : kRuleCells) {
    std::cout << cell;
  }
  std::cout << "|\n";
}

// Writes one table cell to std::cout: a leading '|' followed by `t` padded to
// a minimum field width of `width` characters.
template <typename T>
void print_w(const T& t, size_t width) {
  std::cout << "|";
  std::cout << std::setw(width);
  std::cout << t;
}

// Prints one result row to std::cout. The field widths match the column
// widths produced by `print_tile`.
void print_result(double load_factor, size_t capacity,
                  size_t max_hbm_for_vectors, size_t max_bucket_size,
                  size_t dim, double missed_ratio, float througput) {
  print_w(load_factor, 6);           // load factor
  print_w(capacity, 10);             // capacity
  print_w(max_hbm_for_vectors, 21);  // max_hbm_for_vectors
  print_w(max_bucket_size, 17);      // max_bucket_size
  print_w(dim, 5);                   // dim
  print_w(missed_ratio, 14);         // missed_ratio
  print_w(througput, 27);            // througput(BillionKV/secs)
  // Close the row.
  std::cout << '|' << '\n';
}

// Benchmarks the `find` overload that reports missed keys and prints one
// result row.
//
// Parameters:
//   capacity             table capacity (used for both init and max capacity)
//   dim                  vector dimension
//   max_hbm_for_vectors  passed through `nv::merlin::MB()` into the options
//   load_factor          fraction of `capacity` inserted before the lookup;
//                        must be in [0, 1]
//   max_bucket_size      bucket size option of the table
//   missed_ratio         fraction of the looked-up keys taken from a range
//                        that starts at `insert_num`
//
// The reported throughput is insert_num / elapsed / 2^30.
void test_find(size_t capacity, size_t dim, size_t max_hbm_for_vectors,
               double load_factor, size_t max_bucket_size,
               double missed_ratio) {
  MERLIN_CHECK(load_factor >= 0.0 && load_factor <= 1.0,
               "Invalid `load_factor`");
  K* h_keys;
  S* h_scores;
  V* h_vectors;

  TableOptions options;
  options.init_capacity = capacity;
  options.max_capacity = capacity;
  options.dim = dim;

  options.max_hbm_for_vectors = nv::merlin::MB(max_hbm_for_vectors);
  options.max_bucket_size = max_bucket_size;

  // Pinned host staging buffers, sized for a completely full table.
  size_t key_num = capacity;
  CUDA_CHECK(cudaMallocHost(&h_keys, key_num * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, key_num * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, key_num * options.dim * sizeof(V)));

  K* d_keys;
  S* d_scores;
  V* d_vectors;
  K* d_missed_keys;
  int* d_missed_indices;
  int* d_missed_size;

  CUDA_CHECK(cudaMalloc(&d_keys, key_num * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, key_num * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, key_num * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_missed_keys, key_num * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_missed_indices, key_num * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_missed_size, sizeof(int)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));
  // insert key-value
  size_t insert_num = (double)key_num * load_factor;
  benchmark::create_continuous_keys<K, S>(h_keys, h_scores, insert_num,
                                          0 /*start*/);
  benchmark::init_value_using_key<K, V>(h_keys, h_vectors, insert_num,
                                        options.dim);
  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, insert_num * sizeof(K),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, insert_num * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                        insert_num * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));
  Table table;
  table.init(options);
  table.insert_or_assign(insert_num, d_keys, d_vectors, d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // find key-value
  // The first `find_num` keys reuse the inserted range (starting at 0); the
  // remaining keys start at `insert_num`, i.e. presumably outside the range
  // that was inserted above.
  size_t find_num = (double)insert_num * (1.0 - missed_ratio);
  benchmark::create_continuous_keys<K, S>(h_keys, nullptr, find_num,
                                          0 /*start*/);
  benchmark::create_continuous_keys<K, S>(
      h_keys + find_num, nullptr, insert_num - find_num, insert_num /*start*/);
  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, insert_num * sizeof(K),
                        cudaMemcpyHostToDevice));

  auto timer = benchmark::Timer<double>();
  timer.start();
  table.find(insert_num, d_keys, d_vectors, d_missed_keys, d_missed_indices,
             d_missed_size, d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  timer.end();

  // The stream is created per call, so it has to be destroyed per call too;
  // otherwise every invocation leaks one stream.
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_missed_keys));
  CUDA_CHECK(cudaFree(d_missed_indices));
  CUDA_CHECK(cudaFree(d_missed_size));

  CudaCheckError();
  float througput = insert_num / timer.getResult() / (1024 * 1024 * 1024.0f);
  print_result(load_factor, capacity, max_hbm_for_vectors, max_bucket_size, dim,
               missed_ratio, througput);
}

// Prints a table header and then runs `test_find` for every combination of
// three `max_hbm_for_vectors` budgets and two bucket sizes, using a fixed
// capacity of 100,000,000 and dim 8.
void test_main(double load_factor, double missed_ratio) {
  constexpr size_t CAPACITY = 100000000UL;
  constexpr size_t kDim = 8;

  // Budgets passed as `max_hbm_for_vectors`, in the order they are run.
  const size_t hbm_budgets[] = {
      8 * 1024UL,  // pure HBM
      1 * 1024UL,  // hybrid
      0,           // pure HMEM
  };
  const size_t bucket_sizes[] = {256, 128};

  print_tile();
  for (const size_t hbm_budget : hbm_budgets) {
    for (const size_t bucket_size : bucket_sizes) {
      test_find(CAPACITY, kDim, hbm_budget, load_factor, bucket_size,
                missed_ratio);
    }
  }
}

// Runs `test_main` for every pair of load factor {0.2, 0.5, 1.0} and missed
// ratio {0, 0.5, 1.0}, iterating missed ratios fastest.
int main() {
  const double load_factors[] = {0.2, 0.5, 1.0};
  const double missed_ratios[] = {0, 0.5, 1.0};

  for (const double load_factor : load_factors) {
    for (const double missed_ratio : missed_ratios) {
      test_main(load_factor, missed_ratio);
    }
  }
  return 0;
}


================================================
FILE: benchmark/merlin_hashtable_benchmark.cc.cu
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <limits>
#include <random>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include "benchmark_util.cuh"
#include "merlin_hashtable.cuh"

using std::cerr;
using std::cout;
using std::endl;
using std::fixed;
using std::setfill;
using std::setprecision;
using std::setw;

using namespace nv::merlin;
using namespace benchmark;

// Selects which benchmark configuration is being run. The title printer
// shows the `insert_and_evict` column only for `pure_hbm`.
enum class Test_Mode {
  pure_hbm,  // == 0
  hybrid,    // == 1
};

const float EPSILON = 0.001f;

// Returns a string consisting of `n` space characters.
std::string rep(int n) {
  std::string padding(n, ' ');
  return padding;
}

using K = uint64_t;
using S = uint64_t;
using V = float;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

// Measures the throughput of one table API.
//
// Steps:
//   1. Fill the table with continuous keys via `find_or_insert` in batches of
//      at most `key_num_per_op` until `init_capacity * load_factor` keys have
//      been submitted.
//   2. Top up with `insert_or_assign` while the measured load factor is more
//      than EPSILON below the target.
//   3. Call the selected API 9 times with a single key as warm-up.
//   4. Generate a batch of `key_num_per_op` keys with the requested `hitrate`
//      and time one call of the selected API on it.
//
// Parameters:
//   table           table under test (shared with the caller)
//   api             API to benchmark
//   dim             vector dimension
//   init_capacity   table capacity used to derive the number of keys to load
//   key_num_per_op  batch size of the timed call
//   load_factor     target load factor before the timed call
//   hitrate         fraction of the timed batch generated as "hit" keys
//
// Returns key_num_per_op / timer.getResult() / 2^30.
template <class Table>
float test_one_api(std::shared_ptr<Table>& table, const API_Select api,
                   const size_t dim, const size_t init_capacity,
                   const size_t key_num_per_op, const float load_factor,
                   const float hitrate = 0.6f) {
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  // Pinned host staging buffers.
  CUDA_CHECK(cudaMallocHost(&h_keys, key_num_per_op * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, key_num_per_op * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, key_num_per_op * sizeof(V) * dim));
  CUDA_CHECK(cudaMallocHost(&h_found, key_num_per_op * sizeof(bool)));

  // NOTE(review): cudaMemset is applied to a pinned host pointer here; this
  // presumably relies on unified virtual addressing - confirm.
  CUDA_CHECK(cudaMemset(h_vectors, 0, key_num_per_op * sizeof(V) * dim));

  // Only these strategies are given caller-provided scores; for the others a
  // null score pointer is passed to the table.
  bool need_scores = (Table::evict_strategy == EvictStrategy::kLfu ||
                      Table::evict_strategy == EvictStrategy::kEpochLfu ||
                      Table::evict_strategy == EvictStrategy::kCustomized);

  K* d_keys;
  S* d_scores_real;
  S* d_scores;
  V* d_vectors;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;
  K* d_keys_out;

  K* d_evict_keys;
  S* d_evict_scores;

  CUDA_CHECK(cudaMalloc(&d_keys, key_num_per_op * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_real, key_num_per_op * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, key_num_per_op * sizeof(V) * dim));
  CUDA_CHECK(cudaMalloc(&d_def_val, key_num_per_op * sizeof(V) * dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_keys_out, key_num_per_op * sizeof(K)));

  CUDA_CHECK(cudaMalloc(&d_evict_keys, key_num_per_op * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_evict_scores, key_num_per_op * sizeof(S)));

  // cudaMemset fills bytes, so these produce byte patterns 0x01.. / 0x02..,
  // not the float values 1 and 2.
  CUDA_CHECK(cudaMemset(d_vectors, 1, key_num_per_op * sizeof(V) * dim));
  CUDA_CHECK(cudaMemset(d_def_val, 2, key_num_per_op * sizeof(V) * dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, key_num_per_op * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, key_num_per_op * sizeof(bool)));

  d_scores = need_scores ? d_scores_real : nullptr;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // initialize insert
  // step 1, no need to load load_factor
  uint64_t key_num_init = static_cast<uint64_t>(init_capacity * load_factor);
  const float target_load_factor = key_num_init * 1.0f / init_capacity;
  // Size of the last batch: a full batch when the total divides evenly.
  uint64_t key_num_remain = key_num_init % key_num_per_op == 0
                                ? key_num_per_op
                                : key_num_init % key_num_per_op;
  int32_t loop_num_init = (key_num_init + key_num_per_op - 1) / key_num_per_op;

  K start = 0UL;

  // Sampled before any key is inserted; used as the score threshold of
  // `export_batch_if`.
  S threshold = benchmark::host_nano<S>();
  int global_epoch = 0;
  for (; global_epoch < loop_num_init; global_epoch++) {
    table->set_global_epoch(global_epoch);
    uint64_t key_num_cur_insert =
        global_epoch == loop_num_init - 1 ? key_num_remain : key_num_per_op;
    create_continuous_keys<K, S>(h_keys, h_scores, key_num_cur_insert, start);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_cur_insert * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores,
                          key_num_cur_insert * sizeof(S),
                          cudaMemcpyHostToDevice));
    table->find_or_insert(key_num_cur_insert, d_keys, d_vectors_ptr, d_found,
                          d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    start += key_num_cur_insert;
  }

  // step 2
  // Top up until the measured load factor is within EPSILON of the target.
  float real_load_factor = table->load_factor(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  while (target_load_factor - real_load_factor > EPSILON) {
    auto key_num_append = static_cast<int64_t>(
        (target_load_factor - real_load_factor) * init_capacity);
    if (key_num_append <= 0) break;
    key_num_append =
        std::min(static_cast<int64_t>(key_num_per_op), key_num_append);
    create_continuous_keys<K, S>(h_keys, h_scores, key_num_append, start);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_append * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores, key_num_append * sizeof(S),
                          cudaMemcpyHostToDevice));
    table->insert_or_assign(key_num_append, d_keys, d_vectors, d_scores,
                            stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    start += key_num_append;
    real_load_factor = table->load_factor(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
  }

  // For trigger the kernel selection in advance.
  int key_num_per_op_warmup = 1;
  for (int i = 0; i < 9; i++, global_epoch++) {
    table->set_global_epoch(global_epoch);
    switch (api) {
      case API_Select::find: {
        table->find(key_num_per_op_warmup, d_keys, d_vectors, d_found, d_scores,
                    stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        break;
      }
      case API_Select::insert_or_assign: {
        table->insert_or_assign(key_num_per_op_warmup, d_keys, d_vectors,
                                d_scores, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        break;
      }
      case API_Select::find_or_insert: {
        table->find_or_insert(key_num_per_op_warmup, d_keys, d_vectors,
                              d_scores, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        break;
      }
      case API_Select::assign: {
        table->assign(key_num_per_op_warmup, d_keys, d_def_val, d_scores,
                      stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        break;
      }
      case API_Select::insert_and_evict: {
        table->insert_and_evict(key_num_per_op_warmup, d_keys, d_vectors,
                                d_scores, d_evict_keys, d_def_val,
                                d_evict_scores, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        break;
      }
      case API_Select::find_ptr: {
        // Case-local pointer table; intentionally shadows the outer
        // d_vectors_ptr.
        V** d_vectors_ptr = nullptr;
        CUDA_CHECK(
            cudaMalloc(&d_vectors_ptr, key_num_per_op_warmup * sizeof(V*)));
        benchmark::array2ptr(d_vectors_ptr, d_vectors, dim,
                             key_num_per_op_warmup, stream);

        CUDA_CHECK(cudaStreamSynchronize(stream));
        table->find(key_num_per_op_warmup, d_keys, d_vectors_ptr, d_found,
                    d_scores, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        benchmark::read_from_ptr(d_vectors_ptr, d_vectors, dim,
                                 key_num_per_op_warmup, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        break;
      }
      case API_Select::find_or_insert_ptr: {
        // Case-local buffers; intentionally shadow the outer ones.
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op_warmup * sizeof(bool)));
        CUDA_CHECK(
            cudaMalloc(&d_vectors_ptr, key_num_per_op_warmup * sizeof(V*)));
        benchmark::array2ptr(d_vectors_ptr, d_vectors, dim,
                             key_num_per_op_warmup, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        table->find_or_insert(key_num_per_op_warmup, d_keys, d_vectors_ptr,
                              d_found, d_scores, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
        break;
      }
      case API_Select::export_batch: {
        size_t* d_dump_counter = nullptr;
        CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));
        CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));

        table->export_batch(key_num_per_op_warmup, 0, d_dump_counter, d_keys,
                            d_vectors, d_scores, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_dump_counter));
        break;
      }
      case API_Select::export_batch_if: {
        size_t* d_dump_counter = nullptr;
        CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));
        CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));
        K pattern = 0;
        table->template export_batch_if<ExportIfPredFunctor>(
            pattern, threshold, key_num_per_op_warmup, 0, d_dump_counter,
            d_keys, d_vectors, d_scores, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_dump_counter));
        break;
      }
      case API_Select::contains: {
        table->contains(key_num_per_op_warmup, d_keys, d_found, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        break;
      }
      default: {
        std::cout << "[Unsupport API]\n";
      }
    }
  }

  // Build the timed batch. `reset` restarts the descending key counter so
  // that every call starts from the same keys.
  create_keys_for_hitrate<K, S>(h_keys, h_scores, key_num_per_op, hitrate,
                                Hit_Mode::last_insert, start, true /*reset*/);
  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, key_num_per_op * sizeof(K),
                        cudaMemcpyHostToDevice));
  // Scores have type S, so size the copy with sizeof(S).
  CUDA_CHECK(cudaMemcpy(d_scores_real, h_scores, key_num_per_op * sizeof(S),
                        cudaMemcpyHostToDevice));
  auto timer = benchmark::Timer<double>();
  global_epoch++;
  table->set_global_epoch(global_epoch);
  switch (api) {
    case API_Select::find: {
      timer.start();
      table->find(key_num_per_op, d_keys, d_vectors, d_found, d_scores, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      break;
    }
    case API_Select::insert_or_assign: {
      timer.start();
      table->insert_or_assign(key_num_per_op, d_keys, d_vectors, d_scores,
                              stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      break;
    }
    case API_Select::find_or_insert: {
      timer.start();
      table->find_or_insert(key_num_per_op, d_keys, d_vectors, d_scores,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      break;
    }
    case API_Select::assign: {
      timer.start();
      table->assign(key_num_per_op, d_keys, d_def_val, d_scores, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      break;
    }
    case API_Select::insert_and_evict: {
      timer.start();
      table->insert_and_evict(key_num_per_op, d_keys, d_vectors, d_scores,
                              d_evict_keys, d_def_val, d_evict_scores, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      break;
    }
    case API_Select::find_ptr: {
      // Case-local pointer table; intentionally shadows the outer
      // d_vectors_ptr. Only the find call itself is timed.
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*)));
      benchmark::array2ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op,
                           stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.start();
      table->find(key_num_per_op, d_keys, d_vectors_ptr, d_found, d_scores,
                  stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      benchmark::read_from_ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      break;
    }
    case API_Select::find_or_insert_ptr: {
      // Case-local buffers; intentionally shadow the outer ones.
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, key_num_per_op * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, key_num_per_op * sizeof(V*)));
      benchmark::array2ptr(d_vectors_ptr, d_vectors, dim, key_num_per_op,
                           stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.start();
      table->find_or_insert(key_num_per_op, d_keys, d_vectors_ptr, d_found,
                            d_scores, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
      break;
    }
    case API_Select::export_batch: {
      size_t* d_dump_counter;

      // Try to export close to but less than `key_num_per_op` data.
      // It's normal to happen `illegal memory access` error occasionally.
      float safe_ratio = 0.995;

      CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));
      CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));
      timer.start();
      table->export_batch(key_num_per_op / target_load_factor * safe_ratio, 0,
                          d_dump_counter, d_keys, d_vectors, d_scores, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      CUDA_CHECK(cudaFree(d_dump_counter));
      break;
    }
    case API_Select::export_batch_if: {
      size_t* d_dump_counter;

      // Try to export close to but less than `key_num_per_op` data.
      // It's normal to happen `illegal memory access` error occasionally.
      float safe_ratio = 0.995;

      CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));
      CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));
      timer.start();
      K pattern = 0;
      table->template export_batch_if<ExportIfPredFunctor>(
          pattern, threshold, key_num_per_op / target_load_factor * safe_ratio,
          0, d_dump_counter, d_keys, d_vectors, d_scores, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      CUDA_CHECK(cudaFree(d_dump_counter));
      break;
    }
    case API_Select::contains: {
      timer.start();
      table->contains(key_num_per_op, d_keys, d_found, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      timer.end();
      break;
    }
    default: {
      std::cout << "[Unsupport API]\n";
    }
  }

  CUDA_CHECK(cudaStreamDestroy(stream));

  // Release every pinned host buffer allocated above, including h_vectors.
  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores_real));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_keys_out));
  CUDA_CHECK(cudaFree(d_evict_keys));
  CUDA_CHECK(cudaFree(d_evict_scores));

  CUDA_CHECK(cudaDeviceSynchronize());
  CudaCheckError();

  float througput =
      key_num_per_op / timer.getResult() / (1024 * 1024 * 1024.0f);
  return througput;
}

static Test_Mode test_mode = Test_Mode::pure_hbm;

// Prints the header of the first result table (title row plus a right-aligned
// markdown separator row). The `insert_and_evict` column is emitted only when
// the global `test_mode` is `Test_Mode::pure_hbm`.
void print_title_a() {
  const bool with_evict_column = (Test_Mode::pure_hbm == test_mode);

  // Title row.
  cout << endl
       << "|    \u03BB | insert_or_assign |   find "
       << "| find_or_insert | assign |  find* "
       << "| find_or_insert* ";
  if (with_evict_column) {
    cout << "| insert_and_evict ";
  }
  cout << "|\n";

  // Separator row; each cell is as wide as the title cell above it.
  static const char* const kRuleCells[] = {
      "|-----:",             // load factor
      "|-----------------:", // insert_or_assign
      "|-------:",           // find
      "|---------------:",   // find_or_insert
      "|-------:",           // assign
      "|-------:",           // find*
      "|----------------:",  // find_or_insert*
  };
  for (const char* cell : kRuleCells) {
    cout << cell;
  }
  if (with_evict_column) {
    cout << "|-----------------:";  // insert_and_evict
  }
  cout << "|\n";
}

// Prints the Markdown header row and the right-aligned separator row for the
// second result table (load factor `λ`, export_batch, export_batch_if and
// contains).
void print_title_b() {
  // Header row.
  std::cout << std::endl
            << "|    \u03BB "
            << "| export_batch "
               "| export_batch_if "
               "|  contains "
            << "|\n";

  // Separator row: each cell is exactly as wide as the header cell above it.
  std::cout << "|-----:"              // |    λ
               "|-------------:"      // | export_batch
               "|----------------:"   // | export_batch_if
               "|----------:"         // |  contains
            << "|\n";
}

// Benchmarks every API in `apis` on one table configuration and prints one
// Markdown table row per entry of `load_factors`.
//
// apis:           APIs to measure, in column order.
// dim:            value dimension stored in the table options.
// init_capacity:  used as both the initial and the maximum table capacity.
// key_num_per_op: number of keys forwarded to each test_one_api() call.
// hbm4values:     HBM budget for values in GB; the whole run is skipped when
//                 fewer whole GiB than this are free on device 0.
// load_factor:    not used by this function; each row takes its load factor
//                 from `load_factors` instead.
// io_by_cpu:      forwarded to the table options.
// load_factors:   one output row is produced per entry.
void test_main(std::vector<API_Select>& apis, const size_t dim,
               const size_t init_capacity = 64 * 1024 * 1024UL,
               const size_t key_num_per_op = 1 * 1024 * 1024UL,
               const size_t hbm4values = 16, const float load_factor = 1.0f,
               const bool io_by_cpu = false,
               const std::vector<float> load_factors = {0.50f, 0.75f, 1.00f}) {
  CUDA_CHECK(cudaSetDevice(0));
  size_t free_bytes, total_bytes;
  CUDA_CHECK(cudaMemGetInfo(&free_bytes, &total_bytes));

  // Skip this configuration when the device cannot hold the requested values.
  const size_t free_gib = free_bytes / (1 << 30);
  if (free_gib < hbm4values) {
    std::cout << "free HBM is not enough, ignore current benchmark!"
              << std::endl;
    return;
  }

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru, Sm80>;

  TableOptions options;
  options.init_capacity = init_capacity;
  options.max_capacity = init_capacity;
  options.dim = dim;
  options.max_hbm_for_vectors = nv::merlin::GB(hbm4values);
  options.io_by_cpu = io_by_cpu;

  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);

  for (const float row_load_factor : load_factors) {
    // First cell of the row: the load factor itself.
    std::cout << "|" << rep(1) << std::fixed << std::setprecision(2)
              << row_load_factor << " ";

    for (const auto api : apis) {
      table->clear();
      CUDA_CHECK(cudaDeviceSynchronize());

      // The load factor is sampled after several calls to the target API, so
      // the API is measured twice in a row and the better result is reported
      // to avoid the impact of that sampling.
      const auto first_run = test_one_api<Table>(
          table, api, dim, init_capacity, key_num_per_op, row_load_factor);
      const auto second_run = test_one_api<Table>(
          table, api, dim, init_capacity, key_num_per_op, row_load_factor);
      const auto best = std::max(first_run, second_run);

      std::cout << "|";
      // Per-column rep() argument; presumably padding that lines the value up
      // with the header cell printed by print_title_a()/print_title_b().
      switch (api) {
        case API_Select::find:
        case API_Select::assign:
        case API_Select::find_ptr:
          std::cout << rep(1);
          break;
        case API_Select::insert_or_assign:
        case API_Select::insert_and_evict:
          std::cout << rep(11);
          break;
        case API_Select::find_or_insert:
          std::cout << rep(9);
          break;
        case API_Select::find_or_insert_ptr:
        case API_Select::export_batch_if:
          std::cout << rep(10);
          break;
        case API_Select::export_batch:
          std::cout << rep(7);
          break;
        case API_Select::contains:
          std::cout << rep(4);
          break;
        default:
          std::cout << "[Unsupport API]";
      }
      std::cout << std::fixed << std::setprecision(3) << std::setw(6)
                << std::setfill(' ') << best << " ";
    }
    std::cout << "|\n";
  }
}

int main() {
  size_t key_num_per_op = 1 * 1024 * 1024UL;
  cudaDeviceProp props;
  CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
  cout << endl
       << "## Benchmark" << endl
       << endl
       << "* GPU: 1 x " << props.name << ": " << props.major << "."
       << props.minor << endl
       << "* Key Type = uint64_t" << endl
       << "* Value Type = float32 * {dim}" << endl
       << "* Key-Values per OP = " << key_num_per_op << endl
       << "* Evict strategy: LRU" << endl
       << "* `\u03BB`" << ": load factor" << endl
       << "* `find*` means the `find` API that directly returns the addresses "
          "of values."
       << endl
       << "* `find_or_insert*` means the `find_or_insert` API that directly "
          "returns the addresses of values."
       << endl
       << "* ***Throughput Unit: Billion-KV/second***" << endl
       << endl;
  auto print_configuration = [](const size_t dim, const size_t init_capacity,
                                const size_t hbm4values) {
    using V = float;
    int32_t capacity = static_cast<int32_t>(init_capacity / (1024 * 1024));
    size_t hmem4values = init_capacity * dim * sizeof(V) / (1024 * 1024 * 1024);
    hmem4values = hmem4values < hbm4values ? 0 : (hmem4values - hbm4values);
    cout << "\n* dim = " << dim << ", " << "capacity = " << capacity
         << " Million-KV, " << "HBM = " << hbm4values << " GB, "
         << "HMEM = " << hmem4values << " GB\n";
  };

  try {
    {
      std::vector<API_Select> apis_a{
          API_Select::insert_or_assign, API_Select::find,
          API_Select::find_or_insert,   API_Select::assign,
          API_Select::find_ptr,         API_Select::find_or_insert_ptr,
          API_Select::insert_and_evict};

      std::vector<API_Select> apis_b{API_Select::export_batch,
                                     API_Select::export_batch_if,
                                     API_Select::contains};
      test_mode = Test_Mode::pure_hbm;

      cout << "### On pure HBM mode: " << endl;
      print_configuration(8, 128 * 1024 * 1024UL, 4);
      print_title_a();
      test_main(apis_a, 8, 128 * 1024 * 1024UL, key_num_per_op, 4);

      print_title_b();
      test_main(apis_b, 8, 128 * 1024 * 1024UL, key_num_per_op, 4);

      print_configuration(32, 128 * 1024 * 1024UL, 16);
      print_title_a();
      test_main(apis_a, 32, 128 * 1024 * 1024UL, key_num_per_op, 16);

      print_title_b();
      test_main(apis_b, 32, 128 * 1024 * 1024UL, key_num_per_op, 16);

      print_configuration(64, 64 * 1024 * 1024UL, 16);
      print_title_a();
      test_main(apis_a, 64, 64 * 1024 * 1024UL, key_num_per_op, 16);

      print_title_b();
      test_main(apis_b, 64, 64 * 1024 * 1024UL, key_num_per_op, 16);

      cout << endl;
    }

    {
      std::vector<API_Select> apis_a{
          API_Select::insert_or_assign, API_Select::find,
          API_Select::find_or_insert,   API_Select::assign,
          API_Select::find_ptr,         API_Select::find_or_insert_ptr};

      std::vector<API_Select> apis_b{API_Select::export_batch,
                                     API_Select::export_batch_if,
                                     API_Select::contains};

      cout << "### On HBM+HMEM hybrid mode: " << endl;
      test_mode = Test_Mode::hybrid;
      print_configuration(64, 128 * 1024 * 1024UL, 16);
      print_title_a();
      test_main(apis_a, 64, 128 * 1024 * 1024UL, key_num_per_op, 16);

      print_title_b();
      test_main(apis_b, 64, 128 * 1024 * 1024UL, key_num_per_op, 16);

      print_configuration(64, 512 * 1024 * 1024UL, 32);
      print_title_a();
      test_main(apis_a, 64, 512 * 1024 * 1024UL, key_num_per_op, 32);

      print_title_b();
      test_main(apis_b, 64, 512 * 1024 * 1024UL, key_num_per_op, 32);
      cout << endl;
    }

    CUDA_CHECK(cudaDeviceSynchronize());
  } catch (const nv::merlin::CudaException& e) {
    cerr << e.what() << endl;
  }
  CUDA_CHECK(cudaDeviceSynchronize());
  return 0;
}


================================================
FILE: build_deps/gpus/BUILD
================================================


================================================
FILE: build_deps/gpus/check_cuda_libs.py
================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Verifies that a list of libraries is installed on the system.

Takes a list of arguments with every two subsequent arguments being a logical
tuple of (path, check_soname). The path to the library and either True or False
to indicate whether to check the soname field on the shared library.

Example Usage:
./check_cuda_libs.py /path/to/lib1.so True /path/to/lib2.so False
"""
import os
import os.path
import platform
import subprocess
import sys

# pylint: disable=g-import-not-at-top,g-importing-member
try:
    from shutil import which
except ImportError:
    from distutils.spawn import find_executable as which
# pylint: enable=g-import-not-at-top,g-importing-member


class ConfigError(Exception):
    """Signals a problem found by this script.

    Raised for a missing library file, a SONAME that does not match the file
    name, or a malformed command-line argument list.
    """


def check_cuda_lib(path, check_soname=True):
    """Tests if a library exists on disk and whether its soname matches the filename.

    The soname check is skipped when `check_soname` is false or when no
    `objdump` executable can be located.

    Args:
      path: the path to the library.
      check_soname: whether to check the soname as well.

    Raises:
      ConfigError: If the library does not exist or if its soname does not
        match the filename.
    """
    if not os.path.isfile(path):
        raise ConfigError("No library found under: " + path)

    objdump = which("objdump")
    if not check_soname or objdump is None:
        return

    # Decode is necessary as in py3 the return type changed from str to bytes
    dump = subprocess.check_output([objdump, "-p", path]).decode("utf-8")
    expected_soname = os.path.basename(path)
    for line in dump.splitlines():
        if "SONAME" not in line:
            continue
        # The soname is the last space-separated token of the SONAME line.
        if line.strip().split(" ")[-1] == expected_soname:
            return

    raise ConfigError("None of the libraries match their SONAME: " + path)


def main():
    """Checks every (path, check_soname) pair given on the command line.

    On success the checked paths are printed, joined by the platform line
    separator. On a ConfigError the message is written to stderr and the
    process exits with status 1.
    """
    try:
        cli_args = sys.argv[1:]
        if len(cli_args) % 2 != 0:
            raise ConfigError("Expected even number of arguments")

        verified = []
        # Arguments come in pairs: the library path, then "True" or "False".
        for lib_path, soname_flag in zip(cli_args[0::2], cli_args[1::2]):
            check_cuda_lib(lib_path, check_soname=(soname_flag == "True"))
            verified.append(lib_path)

        # pylint: disable=superfluous-parens
        print(os.linesep.join(verified))
        # pylint: enable=superfluous-parens
    except ConfigError as err:
        sys.stderr.write(str(err))
        sys.exit(1)

if __name__ == "__main__":
    main()


================================================
FILE: build_deps/gpus/configure.bzl
================================================
"""Repository rule for CUDA autoconfiguration.

`cuda_configure` depends on the following environment variables:

  * `NEED_CUDA`: Whether to enable building with CUDA.
  * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path
  * `SYSROOT`: The sysroot to use when compiling.
  * `CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is
    `/usr/local/cuda,usr/`.
  * `CUDA_TOOLKIT_PATH` (deprecated): The path to the CUDA toolkit. Default is
    `/usr/local/cuda`.
  * `CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then
    use the system default.
  * `CUDNN_VERSION`: The version of the cuDNN library.
  * `CUDNN_INSTALL_PATH` (deprecated): The path to the cuDNN library. Default is
    `/usr/local/cuda`.
  * `CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is
    `3.5,5.2`.
  * `PYTHON_BIN_PATH`: The python binary path
"""

load(
    "@bazel_tools//tools/cpp:lib_cc_configure.bzl",
    "escape_string",
    "get_env_var",
)
load(
    "//build_deps/remote_config:common.bzl",
    "config_repo_label",
    "err_out",
    "execute",
    "get_bash_bin",
    "get_cpu_value",
    "get_host_environ",
    "get_python_bin",
    "raw_exec",
    "read_dir",
    "realpath",
    "which",
)

_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH"
_GCC_HOST_COMPILER_PREFIX = "GCC_HOST_COMPILER_PREFIX"
_SYSROOT = "SYSROOT"
_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH"
_CUDA_VERSION = "CUDA_VERSION"
_CUDNN_VERSION = "CUDNN_VERSION"
_CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH"
_CUDA_COMPUTE_CAPABILITIES = "CUDA_COMPUTE_CAPABILITIES"
_CUDA_CONFIG_REPO = "CUDA_CONFIG_REPO"
_PYTHON_BIN_PATH = "PYTHON_BIN_PATH"

_TENSORRT_VERSION = "TENSORRT_VERSION"
_TENSORRT_INSTALL_PATH = "TENSORRT_INSTALL_PATH"
_TENSORRT_STATIC_PATH = "TENSORRT_STATIC_PATH"
_TENSORRT_LIBS = [
    "nvinfer",
    "nvinfer_plugin",
    "nvonnxparser",
    "nvparsers",
]
_TENSORRT_HEADERS = [
    "NvInfer.h",
    "NvUtils.h",
    "NvInferPlugin.h",
]
_TENSORRT_HEADERS_V6 = [
    "NvInfer.h",
    "NvUtils.h",
    "NvInferPlugin.h",
    "NvInferVersion.h",
    "NvInferRuntime.h",
    "NvInferRuntimeCommon.h",
    "NvInferPluginUtils.h",
    "NvOnnxParser.h",
    "NvOnnxConfig.h",
]
_TENSORRT_HEADERS_V8 = [
    "NvInfer.h",
    "NvInferLegacyDims.h",
    "NvInferImpl.h",
    "NvUtils.h",
    "NvInferPlugin.h",
    "NvInferVersion.h",
    "NvInferRuntime.h",
    "NvInferRuntimeCommon.h",
    "NvInferPluginUtils.h",
    "NvOnnxParser.h",
    "NvOnnxConfig.h",
]

def _at_least_version(actual_version, required_version):
    """Returns True if `actual_version` is at least `required_version`.

    Both arguments are dot-separated version strings. Each one is split on "."
    and converted to a list of ints, and the two lists are compared.
    """
    actual_parts = [int(part) for part in actual_version.split(".")]
    required_parts = [int(part) for part in required_version.split(".")]
    return required_parts <= actual_parts

def _get_tensorrt_headers(tensorrt_version):
    """Returns the TensorRT header list that applies to `tensorrt_version`.

    Versions >= 8 get the V8 list, versions >= 6 the V6 list, and anything
    older the base list.
    """
    header_sets = [
        ("8", _TENSORRT_HEADERS_V8),
        ("6", _TENSORRT_HEADERS_V6),
    ]
    for minimum_version, headers in header_sets:
        if _at_least_version(tensorrt_version, minimum_version):
            return headers
    return _TENSORRT_HEADERS

def to_list_of_strings(elements):
    """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'.

    The result is meant to be substituted into a bzl file template, where it
    is interpreted as the contents of a Starlark list of strings.

    Args:
      elements: list of string elements

    Returns:
      single string of elements wrapped in quotes separated by a comma."""
    quote = '"'
    return ", ".join([quote + element + quote for element in elements])

def verify_build_defines(params):
    """Verify all variables that crosstool/BUILD.tpl expects are substituted.

    Every expected variable `name` must appear in `params` under the key
    "%{name}". If any are absent, configuration fails with a message that
    lists them.

    Args:
      params: dict of variables that will be passed to the BUILD.tpl template.
    """
    expected = [
        "cxx_builtin_include_directories",
        "extra_no_canonical_prefixes_flags",
        "host_compiler_path",
        "host_compiler_prefix",
        "host_compiler_warnings",
        "linker_bin_path",
        "compiler_deps",
        "unfiltered_compile_flags",
    ]
    missing = [
        name
        for name in expected
        if ("%{" + name + "}") not in params
    ]

    if missing:
        auto_configure_fail(
            "BUILD.tpl template is missing these variables: " + str(missing) +
            ".\nWe only got: " + str(params) + ".",
        )

# TODO(dzc): Once these functions have been factored out of Bazel's
# cc_configure.bzl, load them from @bazel_tools instead.
# BEGIN cc_configure common functions.
def find_cc(repository_ctx):
    """Find the C++ compiler.

    The compiler name is taken from the GCC_HOST_COMPILER_PATH environment
    variable when it is set, otherwise "gcc". A name starting with "/" is
    returned as is; any other name is resolved with `which`.

    Args:
      repository_ctx: The repository context.

    Returns:
      The compiler path. Fails the configuration if it cannot be resolved.
    """
    default_cc_name = "gcc"
    env_var = _GCC_HOST_COMPILER_PATH

    cc_name = get_host_environ(repository_ctx, env_var) or default_cc_name
    if cc_name.startswith("/"):
        # Absolute path, maybe we should make this supported by our which function.
        return cc_name

    resolved = which(repository_ctx, cc_name)
    if resolved == None:
        fail(("Cannot find {}, either correct your path or set the {}" +
              " environment variable").format(default_cc_name, env_var))
    return resolved

_INC_DIR_MARKER_BEGIN = "#include <...>"

# OSX add " (framework directory)" at the end of line, strip it.
_OSX_FRAMEWORK_SUFFIX = " (framework directory)"
_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX)

def _cxx_inc_convert(path):
    """Convert path returned by cc -E xc++ in a complete path.

    Surrounding whitespace is stripped, and so is a trailing
    " (framework directory)" suffix when present.
    """
    stripped = path.strip()
    if not stripped.endswith(_OSX_FRAMEWORK_SUFFIX):
        return stripped
    return stripped[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip()

def _normalize_include_path(repository_ctx, path):
    """Normalizes include paths before writing them to the crosstool.

    A path that starts with the repository's 'crosstool' folder is returned
    relative to that folder. Any other path is returned as the string form of
    `repository_ctx.path(path)`.
    """
    resolved = str(repository_ctx.path(path))
    crosstool_root = str(repository_ctx.path(".").get_child("crosstool"))

    if not resolved.startswith(crosstool_root):
        return resolved

    # We drop the path to "$REPO/crosstool" and a trailing path separator.
    return resolved[len(crosstool_root) + 1:]

def _is_compiler_option_supported(repository_ctx, cc, option):
    """Checks that `option` is supported by the C compiler. Doesn't %-escape the option.

    Compiles tools/cpp/empty.cc with `option` and treats the option as
    supported when it is not mentioned in the compiler's stderr.
    """
    probe_source = str(repository_ctx.path("tools/cpp/empty.cc"))
    probe = repository_ctx.execute(
        [cc, option, "-o", "/dev/null", "-c", probe_source],
    )
    return option not in probe.stderr

def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp, tf_sysroot):
    """Compute the list of default C or C++ include directories.

    Runs `cc -E -x<lang> - -v` (with `--sysroot` when `tf_sysroot` is set) and
    takes the lines between the "#include <...>" marker line and the end of
    the last line that begins with a space. When the compiler supports
    `-print-resource-dir`, "<resource dir>/share" is appended as well.

    Args:
      repository_ctx: The repository context.
      cc: The compiler to query.
      lang_is_cpp: True to query for C++, False for C.
      tf_sysroot: Optional sysroot passed to the compiler.

    Returns:
      A list of normalized include directories; empty when the expected
      markers are not found in the compiler output.
    """
    lang = "c++" if lang_is_cpp else "c"
    sysroot_args = ["--sysroot", tf_sysroot] if tf_sysroot else []
    verbose_run = raw_exec(
        repository_ctx,
        [cc, "-E", "-x" + lang, "-", "-v"] + sysroot_args,
    )
    output = err_out(verbose_run)

    # Locate the end of the marker line; the directory block starts after it.
    marker_pos = output.find(_INC_DIR_MARKER_BEGIN)
    if marker_pos == -1:
        return []
    block_start = output.find("\n", marker_pos)
    if block_start == -1:
        return []

    # The last line that begins with a space closes the block.
    last_indented = output.rfind("\n ")
    if last_indented == -1 or last_indented < block_start:
        return []
    block_end = output.find("\n", last_indented + 1)
    if block_end == -1:
        inc_dirs = output[block_start + 1:]
    else:
        inc_dirs = output[block_start + 1:block_end].strip()

    if _is_compiler_option_supported(repository_ctx, cc, "-print-resource-dir"):
        resource_dir = repository_ctx.execute(
            [cc, "-print-resource-dir"],
        ).stdout.strip() + "/share"
        inc_dirs += "\n" + resource_dir

    return [
        _normalize_include_path(repository_ctx, _cxx_inc_convert(entry))
        for entry in inc_dirs.split("\n")
    ]

def get_cxx_inc_directories(repository_ctx, cc, tf_sysroot):
    """Compute the list of default C and C++ include directories.

    The C++ directories come first, followed by every C directory that is not
    already among the C++ ones.
    """
    cpp_dirs = _get_cxx_inc_directories_impl(repository_ctx, cc, True, tf_sysroot)
    c_dirs = _get_cxx_inc_directories_impl(repository_ctx, cc, False, tf_sysroot)

    combined = list(cpp_dirs)
    for inc in c_dirs:
        if inc not in cpp_dirs:
            combined.append(inc)
    return combined

def auto_configure_fail(msg):
    """Output failure message when cuda configuration fails.

    The "Cuda Configuration Error:" prefix is wrapped in ANSI escape codes
    (red on, then reset) and followed by `msg`.
    """
    banner = "\033[0;31m" + "Cuda Configuration Error:" + "\033[0m"
    fail("\n%s %s\n" % (banner, msg))

# END cc_configure common functions (see TODO above).

def _cuda_include_path(repository_ctx, cuda_config):
    """Returns the CUDA include directories.

    Runs `nvcc -v` and looks for a "#$ _TARGET_DIR_=" line in its output. When
    one is found, "<toolkit>/<target dir>/include" is listed first. The
    toolkit's own "include" directory is always listed.

    Args:
      repository_ctx: The repository context.
      cuda_config: The CUDA config; `cuda_toolkit_path` and `cpu_value` are
        read from it.

    Returns:
      A list of resolved CUDA include directories.
    """
    toolkit_path = cuda_config.cuda_toolkit_path
    exe_suffix = ".exe" if cuda_config.cpu_value == "Windows" else ""
    nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % (toolkit_path, exe_suffix))

    # The expected exit code of this command is non-zero. Bazel remote execution
    # only caches commands with zero exit code. So force a zero exit code.
    cmd = "%s -v /dev/null -o /dev/null ; [ $? -eq 1 ]" % str(nvcc_path)
    result = raw_exec(
        repository_ctx,
        [get_bash_bin(repository_ctx), "-c", cmd],
    )

    marker = "#$ _TARGET_DIR_="
    target_dir = ""
    for line in err_out(result).splitlines():
        # If several lines match, the last one wins.
        if line.startswith(marker):
            target_dir = toolkit_path + "/" + line.replace(marker, "") + "/include"

    inc_entries = []
    if target_dir != "":
        inc_entries.append(realpath(repository_ctx, target_dir))
    inc_entries.append(realpath(repository_ctx, toolkit_path + "/include"))
    return inc_entries

def matches_version(environ_version, detected_version):
    """Checks whether the user-specified version matches the detected version.

    The match is weak: the user may give only the major, or the major and
    minor, version. The versions match when every part the user specified
    equals the corresponding part of the detected version. To illustrate:

        environ_version  detected_version  result
        -----------------------------------------
        5.1.3            5.1.3             True
        5.1              5.1.3             True
        5                5.1               True
        5.1.3            5.1               False
        5.2.3            5.1.3             False

    Args:
      environ_version: The version specified by the user via environment
        variables.
      detected_version: The version autodetected from the CUDA installation on
        the system.
    Returns: True if user-specified version matches detected version and False
      otherwise.
    """
    wanted_parts = environ_version.split(".")
    detected_parts = detected_version.split(".")

    # The user asked for more precision than was detected: cannot match.
    if len(detected_parts) < len(wanted_parts):
        return False

    # Only the leading parts that the user specified have to agree.
    return detected_parts[:len(wanted_parts)] == wanted_parts

_NVCC_VERSION_PREFIX = "Cuda compilation tools, release "

_DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR"

def compute_capabilities(repository_ctx):
    """Returns a list of strings representing cuda compute capabilities.

    The value is read from the CUDA_COMPUTE_CAPABILITIES environment variable
    as a comma-separated list; "compute_35,compute_52" is passed to
    get_host_environ as the fallback. Entries in the old 'x.y' form are
    rewritten to the 'sm_xy' / 'compute_xy' form, duplicates are removed, and
    every entry is validated.

    Args:
      repository_ctx: the repo rule's context.
    Returns: list of cuda architectures to compile for. 'compute_xy' refers to
      both PTX and SASS, 'sm_xy' refers to SASS only.
    """
    capabilities = get_host_environ(
        repository_ctx,
        _CUDA_COMPUTE_CAPABILITIES,
        "compute_35,compute_52",
    ).split(",")

    # Map old 'x.y' capabilities to 'compute_xy'.
    if len(capabilities) > 0 and all(
        [len(x.split(".")) == 2 for x in capabilities],
    ):
        # If all capabilities are in 'x.y' format, only include PTX for the
        # highest capability.
        # NOTE(review): `sorted` orders the digit strings lexicographically, so
        # "highest" assumes all entries have the same number of digits.
        cc_list = sorted([x.replace(".", "") for x in capabilities])
        capabilities = [
            "sm_%s" % x
            for x in cc_list[:-1]
        ] + ["compute_%s" % cc_list[-1]]

    # Mixed input: rewrite any remaining 'x.y' entry to 'compute_xy' and leave
    # entries that are already in 'compute_xy' / 'sm_xy' form untouched.
    for i, capability in enumerate(capabilities):
        parts = capability.split(".")
        if len(parts) != 2:
            continue
        capabilities[i] = "compute_%s%s" % (parts[0], parts[1])

    # Make list unique
    capabilities = dict(zip(capabilities, capabilities)).keys()

    # Validate capabilities.
    for capability in capabilities:
        # Each entry must carry one of the two known prefixes...
        if not capability.startswith(("compute_", "sm_")):
            auto_configure_fail("Invalid compute capability: %s" % capability)

        # ...followed by exactly two digits.
        for prefix in ["compute_", "sm_"]:
            if not capability.startswith(prefix):
                continue
            if len(capability) == len(prefix) + 2 and capability[-2:].isdigit(
            ):
                continue
            auto_configure_fail("Invalid compute capability: %s" % capability)

    return capabilities

def lib_name(base_name, cpu_value, version = None, static = False):
    """Constructs the platform-specific name of a library.

    Args:
      base_name: The name of the library, such as "cudart"
      cpu_value: The name of the host operating system.
      version: The version of the library. Only used for shared libraries on
        Linux, FreeBSD and Darwin.
      static: True the library is static or False if it is a shared object.
        Ignored on Windows.

    Returns:
      The platform-specific name of the library. Fails the configuration for
      an unknown `cpu_value`.
    """
    version_suffix = "." + version if version else ""

    if cpu_value == "Windows":
        return "%s.lib" % base_name

    if cpu_value in ("Linux", "FreeBSD", "Darwin"):
        if static:
            return "lib%s.a" % base_name
        if cpu_value == "Darwin":
            return "lib%s%s.dylib" % (base_name, version_suffix)
        return "lib%s.so%s" % (base_name, version_suffix)

    auto_configure_fail("Invalid cpu_value: %s" % cpu_value)

def _lib_path(lib, cpu_value, basedir, version, static):
    """Returns `basedir` joined with the platform-specific file name of `lib`."""
    return "%s/%s" % (basedir, lib_name(lib, cpu_value, version, static))

def _should_check_soname(version, static):
    """Returns a truthy value only for a versioned, non-static library.

    A falsy `version` is returned unchanged; otherwise the result is
    `not static`.
    """
    if not version:
        return version
    return not static

def _check_cuda_lib_params(lib, cpu_value, basedir, version, static = False):
    """Returns the (path, check_soname) pair describing one library to verify."""
    path = _lib_path(lib, cpu_value, basedir, version, static)
    check_soname = _should_check_soname(version, static)
    return (path, check_soname)

def _check_cuda_libs(repository_ctx, script_path, libs):
    """Runs the library check script on `libs` and fails on any mismatch.

    The script at `script_path` is read and re-created as 'script.py' by a
    generated Python one-liner, which then runs it with one quoted path and
    one check flag per library. The paths the script prints must equal the
    input paths, in order.

    Args:
      repository_ctx: The repository context.
      script_path: Path of the check script to embed.
      libs: Sequence of (path, check_soname) pairs.
    """
    python_bin = get_python_bin(repository_ctx)
    script_lines = repository_ctx.read(script_path).splitlines()

    # NOTE(review): each line is embedded in a single-quoted Python literal, so
    # this presumably relies on the script containing no single quotes or
    # backslashes - confirm before editing the script.
    statements = [
        "from os import linesep;",
        "f = open('script.py', 'w');",
    ]
    statements += ["f.write('%s' + linesep);" % line for line in script_lines]
    statements.append("f.close();")
    statements.append("from os import system;")

    script_args = " ".join(
        ["\"" + path + "\" " + str(check) for path, check in libs],
    )
    statements.append("system('%s script.py %s');" % (python_bin, script_args))
    cmd = "".join(statements)

    expected_paths = [path for path, _ in libs]
    reported_lines = execute(
        repository_ctx,
        [python_bin, "-c", cmd],
    ).stdout.splitlines()

    # Filter out empty lines from splitting on '\r\n' on Windows
    reported_paths = [line for line in reported_lines if len(line) > 0]
    if expected_paths != reported_paths:
        auto_configure_fail(
            "Error with installed CUDA libs. Expected '%s'. Actual '%s'." %
            (expected_paths, reported_paths),
        )

def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config):
    """Returns the paths of the CUDA and cuDNN libraries on the system.

    Also verifies, through the check script, that every library exists.

    Args:
      repository_ctx: The repository context.
      check_cuda_libs_script: The path to a script verifying that the cuda
        libraries exist on the system.
      cuda_config: The CUDA config as returned by _get_cuda_config

    Returns:
      Map of library names to library paths.
    """
    cpu_value = cuda_config.cpu_value
    config = cuda_config.config
    stub_dir = "/stubs"

    # One row per library: (name, directory, version, static).
    # The "cuda" entry is looked up in the stubs directory, without a version.
    lib_specs = [
        ("cuda", config["cuda_library_dir"] + stub_dir, None, False),
        ("cudart", config["cuda_library_dir"], cuda_config.cudart_version, False),
        ("cudart_static", config["cuda_library_dir"], cuda_config.cudart_version, True),
        ("cublas", config["cublas_library_dir"], cuda_config.cublas_version, False),
        ("cublasLt", config["cublas_library_dir"], cuda_config.cublas_version, False),
        ("cusolver", config["cusolver_library_dir"], cuda_config.cusolver_version, False),
        ("curand", config["curand_library_dir"], cuda_config.curand_version, False),
        ("cufft", config["cufft_library_dir"], cuda_config.cufft_version, False),
        ("cudnn", config["cudnn_library_dir"], cuda_config.cudnn_version, False),
        ("cupti", config["cupti_library_dir"], cuda_config.cupti_version, False),
        ("cusparse", config["cusparse_library_dir"], cuda_config.cusparse_version, False),
    ]

    check_cuda_libs_params = {
        name: _check_cuda_lib_params(name, cpu_value, basedir, version, static)
        for (name, basedir, version, static) in lib_specs
    }

    # Verify that the libs actually exist at their locations.
    _check_cuda_libs(
        repository_ctx,
        check_cuda_libs_script,
        check_cuda_libs_params.values(),
    )

    # Keep only the path of each (path, check_soname) pair.
    return {
        name: params[0]
        for (name, params) in check_cuda_libs_params.items()
    }

def _cudart_static_linkopt(cpu_value):
    """Returns additional platform-specific linkopts for cudart.

    The result is empty on Darwin and the quoted, comma-terminated "-lrt"
    option everywhere else.
    """
    if cpu_value == "Darwin":
        return ""
    return "\"-lrt\","

def _exec_find_cuda_config(repository_ctx, script_path, cuda_libraries):
    """Runs the script at `script_path` with `cuda_libraries` as arguments.

    The script is started through `os.system` from a Python one-liner, using
    the configured Python binary for both the one-liner and the script.

    Returns:
      The result of `execute` for the one-liner.
    """
    python_bin = get_python_bin(repository_ctx)
    library_args = " ".join(cuda_libraries)
    invocation = "system('\"%s\" %s %s');" % (python_bin, script_path, library_args)
    cmd = "from os import system;" + invocation
    return execute(repository_ctx, [python_bin, "-c", cmd])

# TODO(csigg): Only call once instead of from here, tensorrt_configure.bzl,
# and nccl_configure.bzl.
def find_cuda_config(repository_ctx, script_path, cuda_libraries):
    """Returns CUDA config dictionary from running find_cuda_config.py

    Fails the configuration when the script exits with a non-zero code.
    Otherwise every stdout line is split on ": " into a key and a value.
    """
    exec_result = _exec_find_cuda_config(
        repository_ctx,
        script_path,
        cuda_libraries,
    )

    if exec_result.return_code:
        auto_configure_fail("Failed to run find_cuda_config.py: %s" %
                            err_out(exec_result))

    # Parse the dict from stdout.
    key_value_pairs = [
        tuple(line.split(": "))
        for line in exec_result.stdout.splitlines()
    ]
    return dict(key_value_pairs)

def _get_cuda_config(repository_ctx, find_cuda_config_script):
    """Detects and returns information about the CUDA installation on the system.

    Args:
      repository_ctx: The repository context.
      find_cuda_config_script: Path of find_cuda_config.py; it is run for the
        "cuda" and "cudnn" libraries.

    Returns:
      A struct containing the following fields:
        cuda_toolkit_path: The "cuda_toolkit_path" entry reported by the script.
        cuda_version: The CUDA version as "major.minor".
        cuda_version_major: The major component of the CUDA version (string).
        cudart_version: "11.0" for CUDA 11, the major version for newer
          releases, "major.minor" for older ones.
        cupti_version: "major.minor" up to CUDA 11, the major version after.
        cublas_version, cusolver_version, curand_version, cufft_version,
        cusparse_version: For CUDA >= 11 the major component of the version
          the script reports for each library; for CUDA >= 10.1 the CUDA major
          version; otherwise the CUDA "major.minor" version.
        cudnn_version: The "cudnn_version" entry reported by the script.
        compute_capabilities: The result of compute_capabilities().
        cpu_value: The result of get_cpu_value().
        config: The raw dictionary returned by find_cuda_config().
    """
    config = find_cuda_config(
        repository_ctx,
        find_cuda_config_script,
        ["cuda", "cudnn"],
    )

    cpu_value = get_cpu_value(repository_ctx)
    toolkit_path = config["cuda_toolkit_path"]

    version_parts = config["cuda_version"].split(".")
    cuda_major = version_parts[0]
    cuda_minor = version_parts[1]

    cuda_version = "%s.%s" % (cuda_major, cuda_minor)
    cudnn_version = "%s" % config["cudnn_version"]

    # Libraries whose version scheme depends on the CUDA release.
    math_libs = ["cublas", "cusolver", "curand", "cufft", "cusparse"]

    if int(cuda_major) >= 11:
        # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward
        # compatibility.
        if int(cuda_major) == 11:
            cudart_version = "11.0"
            cupti_version = cuda_version
        else:
            cudart_version = "%s" % cuda_major
            cupti_version = cudart_version

        # Each library carries its own version; only the major part is kept.
        lib_versions = {
            lib: "%s" % config[lib + "_version"].split(".")[0]
            for lib in math_libs
        }
    elif (int(cuda_major), int(cuda_minor)) >= (10, 1):
        # The version used for libraries like cuBLAS, cuFFT, cuSOLVER, etc.
        # changed from 'x.y' to just 'x' in CUDA 10.1.
        cudart_version = cuda_version
        cupti_version = cuda_version
        major_only = "%s" % cuda_major
        lib_versions = {lib: major_only for lib in math_libs}
    else:
        cudart_version = cuda_version
        cupti_version = cuda_version
        lib_versions = {lib: cuda_version for lib in math_libs}

    return struct(
        cuda_toolkit_path = toolkit_path,
        cuda_version = cuda_version,
        cupti_version = cupti_version,
        cuda_version_major = cuda_major,
        cudart_version = cudart_version,
        cublas_version = lib_versions["cublas"],
        cusolver_version = lib_versions["cusolver"],
        curand_version = lib_versions["curand"],
        cufft_version = lib_versions["cufft"],
        cusparse_version = lib_versions["cusparse"],
        cudnn_version = cudnn_version,
        compute_capabilities = compute_capabilities(repository_ctx),
        cpu_value = cpu_value,
        config = config,
    )

def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
    """Expands the template `//build_deps/gpus/<tpl>.tpl` into the repository.

    Args:
      repository_ctx: The repository context.
      tpl: Template name in "package:file" form, without the ".tpl" suffix.
      substitutions: Substitution dictionary handed to the template call.
      out: Output path; when not given, `tpl` with ":" replaced by "/".
    """
    destination = out if out else tpl.replace(":", "/")
    template_label = Label("//build_deps/gpus/%s.tpl" % tpl)
    repository_ctx.template(destination, template_label, substitutions)

def _file(repository_ctx, label):
    """Copies `//build_deps/gpus/<label>.tpl` into the repository unchanged.

    The template is expanded with an empty substitution dictionary and written
    to `label` with ":" replaced by "/".

    Args:
      repository_ctx: The repository context.
      label: File name in "package:file" form, without the ".tpl" suffix.
    """
    destination = label.replace(":", "/")
    template_label = Label("//build_deps/gpus/%s.tpl" % label)
    repository_ctx.template(destination, template_label, {})

# Text of a placeholder .bzl file. Its `error_gpu_disabled` macro calls fail()
# with a message explaining that GPU support was not configured.
# NOTE(review): no use of this constant is visible in this part of the file -
# confirm it is still needed.
_DUMMY_CROSSTOOL_BZL_FILE = """
def error_gpu_disabled():
  fail("ERROR: Building with --config=cuda but TensorFlow is not configured " +
       "to build with GPU support. Please re-run ./configure and enter 'Y' " +
       "at the prompt to build with GPU support.")

  native.genrule(
      name = "error_gen_crosstool",
      outs = ["CROSSTOOL"],
      cmd = "echo 'Should not be run.' && exit 1",
  )

  native.filegroup(
      name = "crosstool",
      srcs = [":CROSSTOOL"],
      output_licenses = ["unencumbered"],
  )
"""

# Text of a placeholder BUILD file that loads `error_gpu_disabled` from
# //crosstool:error_gpu_disabled.bzl and invokes it.
_DUMMY_CROSSTOOL_BUILD_FILE = """
load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled")

error_gpu_disabled()
"""

def _norm_path(path):
    """Returns a path with '/' separators and without one trailing slash.

    Args:
      path: Path string; may use "\\" or "/" as separator and may be empty.

    Returns:
      `path` with every backslash replaced by "/" and, if the result ends
      with "/", that single trailing character removed.
    """
    path = path.replace("\\", "/")

    # endswith() instead of path[-1]: indexing an empty string is an error.
    if path.endswith("/"):
        path = path[:-1]
    return path

def make_copy_files_rule(repository_ctx, name, srcs, outs):
    """Returns the text of a genrule that copies a set of files.

    Args:
      repository_ctx: The repository context (not used by this function).
      name: Name of the generated genrule.
      srcs: Source file paths, paired position-wise with `outs`.
      outs: Output paths declared by the genrule.

    Returns:
      A string holding the genrule definition; its command chains one
      `cp -f` per (src, out) pair with " && ".
    """
    copy_cmds = [
        'cp -f "%s" "$(location %s)"' % (source, target)
        for source, target in zip(srcs, outs)
    ]
    out_entries = "\n".join([('        "%s",' % target) for target in outs])
    script = " && \\\n".join(copy_cmds)
    return """genrule(
    name = "%s",
    outs = [
%s
    ],
    cmd = \"""%s \""",
)""" % (name, out_entries, script)

def make_copy_dir_rule(
        repository_ctx,
        name,
        src_dir,
        out_dir,
        exceptions = None):
    """Returns the text of a genrule that recursively copies a directory.

    Args:
      repository_ctx: The repository context, used to list `src_dir`.
      name: Name of the generated genrule.
      src_dir: Directory to copy from.
      out_dir: Directory, relative to the genrule output root, to copy to.
      exceptions: Optional list of files or directories in `src_dir`. Entries
        whose path starts with `src_dir/<exception>` are left out of the
        declared outputs and removed again after the copy.

    Returns:
      A string holding the genrule definition.
    """
    source_root = _norm_path(src_dir)
    dest_root = _norm_path(out_dir)
    entries = read_dir(repository_ctx, source_root)

    if exceptions != None:
        excluded_prefixes = [source_root + "/" + item for item in exceptions]
        entries = [
            entry
            for entry in entries
            if not any([entry.startswith(prefix) for prefix in excluded_prefixes])
        ]
    out_lines = [
        ('        "%s",' % entry.replace(source_root, dest_root))
        for entry in entries
    ]

    # '@D' already contains the relative path for a single file, see
    # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables
    if len(out_lines) > 1:
        copy_target = "$(@D)/%s" % dest_root
    else:
        copy_target = "$(@D)"

    # Excluded entries are copied along with everything else, so delete them
    # from the destination afterwards.
    cleanup = ""
    if exceptions != None:
        cleanup = "".join([
            " ; rm -fR " + copy_target + "/" + item
            for item in exceptions
        ])
    return """genrule(
    name = "%s",
    outs = [
%s
    ],
    cmd = \"""cp -rLf "%s/." "%s/" %s\""",
)""" % (name, "\n".join(out_lines), source_root, copy_target, cleanup)

def _flag_enabled(repository_ctx, flag_name):
    """Returns True when `get_host_environ` yields "1" for `flag_name`."""
    flag_value = get_host_environ(repository_ctx, flag_name)
    return flag_value == "1"

def _tf_sysroot(repository_ctx):
    """Returns the value looked up for `_SYSROOT`, with "" as the default."""
    sysroot = get_host_environ(repository_ctx, _SYSROOT, "")
    return sysroot

def _compute_cuda_extra_copts(repository_ctx, compute_capabilities):
    """Builds the extra compiler options for the given compute capabilities.

    Args:
      repository_ctx: The repository context (not used by this function).
      compute_capabilities: Capability names such as "sm_70" or "compute_80".

    Returns:
      The string form of a list of options. Every capability contributes a
      `--cuda-gpu-arch=` option; one that starts with "compute_" is first
      rewritten to its "sm_" form and also contributes a preceding
      `--cuda-include-ptx=` option.
    """
    options = []
    for entry in compute_capabilities:
        arch = entry
        if entry.startswith("compute_"):
            arch = entry.replace("compute_", "sm_")
            options.append("--cuda-include-ptx=%s" % arch)
        options.append("--cuda-gpu-arch=%s" % arch)

    return str(options)

def _tpl_path(repository_ctx, filename):
    """Returns the path of the template `//build_deps/gpus/<filename>.tpl`."""
    template_label = Label("//build_deps/gpus/%s.tpl" % filename)
    return repository_ctx.path(template_label)

def _basename(repository_ctx, path_str):
    """Returns the basename of a path of type string.

    Args:
      repository_ctx: The repository context (not used by this function).
      path_str: Path using "/" as separator.

    Returns:
      The text after the last "/", or `path_str` itself when it has no "/".
    """

    # rfind() yields -1 when there is no "/", which makes the slice start at 0.
    last_slash = path_str.rfind("/")
    return path_str[last_slash + 1:]

def _create_local_cuda_repository(repository_ctx):
    """Creates the repository containing files set up to build with CUDA.

    The function resolves the template paths, queries the local CUDA
    installation, generates copy rules for headers, libraries and binaries,
    and finally expands the templates for `cuda/` and `crosstool/`.

    Args:
      repository_ctx: The repository context.
    """
    tpl_paths = {
        filename: _tpl_path(repository_ctx, filename)
        for filename in [
            "cuda:build_defs.bzl",
            "crosstool:crosstool_compiler_wrapper",
            "crosstool:BUILD",
            "crosstool:cc_toolchain_config.bzl",
            "cuda:cuda_config.h",
            "cuda:cuda_config.py",
        ]
    }
    tpl_paths["cuda:BUILD"] = _tpl_path(repository_ctx, "cuda:BUILD")
    find_cuda_config_script = repository_ctx.path(
        Label("//build_deps/gpus:find_cuda_config.py"),
    )

    cuda_config = _get_cuda_config(repository_ctx, find_cuda_config_script)

    cuda_include_path = cuda_config.config["cuda_include_dir"]
    cublas_include_path = cuda_config.config["cublas_include_dir"]
    cudnn_header_dir = cuda_config.config["cudnn_include_dir"]
    cupti_header_dir = cuda_config.config["cupti_include_dir"]
    nvvm_libdevice_dir = cuda_config.config["nvvm_library_dir"]

    # Create genrule to copy files from the installed CUDA toolkit into execroot.
    copy_rules = [
        make_copy_dir_rule(
            repository_ctx,
            name = "cuda-include",
            src_dir = cuda_include_path,
            out_dir = "cuda/include",
        ),
        make_copy_dir_rule(
            repository_ctx,
            name = "cuda-nvvm",
            src_dir = nvvm_libdevice_dir,
            out_dir = "cuda/nvvm/libdevice",
        ),
        make_copy_dir_rule(
            repository_ctx,
            name = "cuda-extras",
            src_dir = cupti_header_dir,
            out_dir = "cuda/extras/CUPTI/include",
        ),
    ]

    copy_rules.append(
        make_copy_files_rule(
            repository_ctx,
            name = "cublas-include",
            srcs = [
                cublas_include_path + "/cublas.h",
                cublas_include_path + "/cublas_v2.h",
                cublas_include_path + "/cublas_api.h",
                cublas_include_path + "/cublasLt.h",
            ],
            outs = [
                "cublas/include/cublas.h",
                "cublas/include/cublas_v2.h",
                "cublas/include/cublas_api.h",
                "cublas/include/cublasLt.h",
            ],
        ),
    )

    cusolver_include_path = cuda_config.config["cusolver_include_dir"]
    copy_rules.append(
        make_copy_files_rule(
            repository_ctx,
            name = "cusolver-include",
            srcs = [
                cusolver_include_path + "/cusolver_common.h",
                cusolver_include_path + "/cusolverDn.h",
            ],
            outs = [
                "cusolver/include/cusolver_common.h",
                "cusolver/include/cusolverDn.h",
            ],
        ),
    )

    cufft_include_path = cuda_config.config["cufft_include_dir"]
    copy_rules.append(
        make_copy_files_rule(
            repository_ctx,
            name = "cufft-include",
            srcs = [
                cufft_include_path + "/cufft.h",
            ],
            outs = [
                "cufft/include/cufft.h",
            ],
        ),
    )

    cusparse_include_path = cuda_config.config["cusparse_include_dir"]
    copy_rules.append(
        make_copy_files_rule(
            repository_ctx,
            name = "cusparse-include",
            srcs = [
                cusparse_include_path + "/cusparse.h",
            ],
            outs = [
                "cusparse/include/cusparse.h",
            ],
        ),
    )

    curand_include_path = cuda_config.config["curand_include_dir"]
    copy_rules.append(
        make_copy_files_rule(
            repository_ctx,
            name = "curand-include",
            srcs = [
                curand_include_path + "/curand.h",
            ],
            outs = [
                "curand/include/curand.h",
            ],
        ),
    )

    # Locate the CUDA libraries and copy each of them into cuda/lib/.
    check_cuda_libs_script = repository_ctx.path(
        Label("//build_deps/gpus:check_cuda_libs.py"),
    )
    cuda_libs = _find_libs(repository_ctx, check_cuda_libs_script, cuda_config)
    cuda_lib_srcs = []
    cuda_lib_outs = []
    for path in cuda_libs.values():
        cuda_lib_srcs.append(path)
        cuda_lib_outs.append("cuda/lib/" + _basename(repository_ctx, path))
    copy_rules.append(
        make_copy_files_rule(
            repository_ctx,
            name = "cuda-lib",
            srcs = cuda_lib_srcs,
            outs = cuda_lib_outs,
        ),
    )

    # Copy the toolkit binaries listed below into cuda/bin/.
    file_ext = ""
    bin_files = (
        ["crt/link.stub"] +
        [f + file_ext for f in ["bin2c", "fatbinary", "nvlink", "nvprune"]]
    )
    copy_rules.append(
        make_copy_files_rule(
            repository_ctx,
            name = "cuda-bin",
            srcs = [
                cuda_config.cuda_toolkit_path + "/bin/" + f
                for f in bin_files
            ],
            outs = ["cuda/bin/" + f for f in bin_files],
        ),
    )

    # Select the headers based on the cuDNN version (strip '64_' for Windows).
    # The major version is compared as a number: comparing the strings would
    # order "10" before "8" and drop the extra headers for cuDNN >= 10.
    cudnn_headers = ["cudnn.h"]
    cudnn_major = cuda_config.cudnn_version.rsplit("_", 1)[-1].split(".")[0]
    if cudnn_major.isdigit() and int(cudnn_major) >= 8:
        cudnn_headers += [
            "cudnn_backend.h",
            "cudnn_adv_infer.h",
            "cudnn_adv_train.h",
            "cudnn_cnn_infer.h",
            "cudnn_cnn_train.h",
            "cudnn_ops_infer.h",
            "cudnn_ops_train.h",
            "cudnn_version.h",
        ]

    cudnn_srcs = []
    cudnn_outs = []
    for header in cudnn_headers:
        cudnn_srcs.append(cudnn_header_dir + "/" + header)
        cudnn_outs.append("cudnn/include/" + header)

    copy_rules.append(
        make_copy_files_rule(
            repository_ctx,
            name = "cudnn-include",
            srcs = cudnn_srcs,
            outs = cudnn_outs,
        ),
    )

    # Set up BUILD file for cuda/
    repository_ctx.template(
        "cuda/build_defs.bzl",
        tpl_paths["cuda:build_defs.bzl"],
        {
            "%{cuda_is_configured}": "True",
            "%{cuda_extra_copts}": _compute_cuda_extra_copts(
                repository_ctx,
                cuda_config.compute_capabilities,
            ),
            "%{cuda_gpu_architectures}": str(cuda_config.compute_capabilities),
        },
    )

    # From CUDA 11 on, point cub at the :cuda_headers target instead of the
    # external cub archive.
    cub_actual = "@cub_archive//:cub"
    if int(cuda_config.cuda_version_major) >= 11:
        cub_actual = ":cuda_headers"

    repository_ctx.template(
        "cuda/BUILD",
        tpl_paths["cuda:BUILD"],
        {
            "%{cuda_driver_lib}": _basename(repository_ctx, cuda_libs["cuda"]),
            "%{cudart_static_lib}": _basename(repository_ctx, cuda_libs["cudart_static"]),
            "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value),
            "%{cudart_lib}": _basename(repository_ctx, cuda_libs["cudart"]),
            "%{cublas_lib}": _basename(repository_ctx, cuda_libs["cublas"]),
            "%{cublasLt_lib}": _basename(repository_ctx, cuda_libs["cublasLt"]),
            "%{cusolver_lib}": _basename(repository_ctx, cuda_libs["cusolver"]),
            "%{cudnn_lib}": _basename(repository_ctx, cuda_libs["cudnn"]),
            "%{cufft_lib}": _basename(repository_ctx, cuda_libs["cufft"]),
            "%{curand_lib}": _basename(repository_ctx, cuda_libs["curand"]),
            "%{cupti_lib}": _basename(repository_ctx, cuda_libs["cupti"]),
            "%{cusparse_lib}": _basename(repository_ctx, cuda_libs["cusparse"]),
            "%{cub_actual}": cub_actual,
            "%{copy_rules}": "\n".join(copy_rules),
        },
    )

    tf_sysroot = _tf_sysroot(repository_ctx)

    # Set up crosstool/
    cc = find_cc(repository_ctx)
    cc_fullpath = cc

    host_compiler_includes = get_cxx_inc_directories(
        repository_ctx,
        cc_fullpath,
        tf_sysroot,
    )
    cuda_defines = {}
    cuda_defines["%{builtin_sysroot}"] = tf_sysroot
    cuda_defines["%{cuda_toolkit_path}"] = ""
    cuda_defines["%{compiler}"] = "unknown"

    host_compiler_prefix = get_host_environ(
        repository_ctx,
        _GCC_HOST_COMPILER_PREFIX,
    )
    if not host_compiler_prefix:
        host_compiler_prefix = "/usr/bin"

    cuda_defines["%{host_compiler_prefix}"] = host_compiler_prefix
    cuda_defines["%{linker_bin_path}"] = host_compiler_prefix
    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ""
    cuda_defines["%{unfiltered_compile_flags}"] = ""

    cuda_defines["%{host_compiler_path}"] = "crosstool_compiler_wrapper"
    cuda_defines["%{host_compiler_warnings}"] = ""

    # nvcc has the system include paths built in and will automatically
    # search them; we cannot work around that, so we add the relevant cuda
    # system paths to the allowed compiler specific include paths.
    cuda_defines["%{cxx_builtin_include_directories}"] = to_list_of_strings(
        host_compiler_includes + _cuda_include_path(
            repository_ctx,
            cuda_config,
        ) + [cupti_header_dir, cudnn_header_dir],
    )

    # For gcc, do not canonicalize system header paths; some versions of gcc
    # pick the shortest possible path for system includes when creating the
    # .d file - given that includes that are prefixed with "../" multiple
    # time quickly grow longer than the root of the tree, this can lead to
    # bazel's header check failing.
    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = "\"-fno-canonical-system-headers\""

    file_ext = ""
    nvcc_path = "%s/nvcc%s" % (cuda_config.config["cuda_binary_dir"], file_ext)
    cuda_defines["%{compiler_deps}"] = ":crosstool_compiler"

    wrapper_defines = {
        "%{cpu_compiler}": str(cc),
        "%{cuda_version}": cuda_config.cuda_version,
        "%{nvcc_path}": nvcc_path,
        "%{gcc_host_compiler_path}": str(cc),
    }
    repository_ctx.template(
        "crosstool/crosstool_compiler_wrapper",
        tpl_paths["crosstool:crosstool_compiler_wrapper"],
        wrapper_defines,
    )

    verify_build_defines(cuda_defines)

    # Only expand template variables in the BUILD file
    repository_ctx.template(
        "crosstool/BUILD",
        tpl_paths["crosstool:BUILD"],
        cuda_defines,
    )

    # No templating of cc_toolchain_config - use attributes and templatize the
    # BUILD file.
    repository_ctx.template(
        "crosstool/cc_toolchain_config.bzl",
        tpl_paths["crosstool:cc_toolchain_config.bzl"],
        {},
    )

    # Set up cuda_config.h
    repository_ctx.template(
        "cuda/cuda/cuda_config.h",
        tpl_paths["cuda:cuda_config.h"],
        {
            "%{cuda_version}": cuda_config.cuda_version,
            "%{cudart_version}": cuda_config.cudart_version,
            "%{cupti_version}": cuda_config.cupti_version,
            "%{cublas_version}": cuda_config.cublas_version,
            "%{cusolver_version}": cuda_config.cusolver_version,
            "%{curand_version}": cuda_config.curand_version,
            "%{cufft_version}": cuda_config.cufft_version,
            "%{cusparse_version}": cuda_config.cusparse_version,
            "%{cudnn_version}": cuda_config.cudnn_version,
            "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path,
            "%{cuda_compute_capabilities}": ", ".join(
                [cc.split("_")[1] for cc in cuda_config.compute_capabilities],
            ),
        },
    )

    # Set up cuda_config.py, which is used by gen_build_info to provide
    # static build environment info to the API
    repository_ctx.template(
        "cuda/cuda/cuda_config.py",
        tpl_paths["cuda:cuda_config.py"],
        _py_tmpl_dict({
            "cuda_version": cuda_config.cuda_version,
            "cudnn_version": cuda_config.cudnn_version,
            "cuda_compute_capabilities": cuda_config.compute_capabilities,
            "cpu_compiler": str(cc),
        }),
    )

def _get_tensorrt_static_path(repository_ctx):
    """Returns the value looked up for `_TENSORRT_STATIC_PATH`, or None."""
    static_path = get_host_environ(repository_ctx, _TENSORRT_STATIC_PATH, None)
    return static_path

def _create_local_tensorrt_repository(repository_ctx):
    """Creates the repository containing the TensorRT libraries and headers.

    Runs find_cuda_config.py for "tensorrt", generates copy rules for the
    shared libraries and headers (plus the static libraries when a static
    path is configured) and expands the four tensorrt templates.

    Args:
      repository_ctx: The repository context.
    """
    script_path = repository_ctx.path(
        Label("//build_deps/gpus:find_cuda_config.py"),
    )
    config = find_cuda_config(repository_ctx, script_path, ["tensorrt"])
    trt_version = config["tensorrt_version"]
    host_cpu = get_cpu_value(repository_ctx)

    # Copy the library and header files
    shared_libs = [
        lib_name(base, host_cpu, trt_version)
        for base in _TENSORRT_LIBS
    ]
    lib_root = config["tensorrt_library_dir"] + "/"
    header_names = _get_tensorrt_headers(trt_version)
    include_root = config["tensorrt_include_dir"] + "/"

    lib_rule = make_copy_files_rule(
        repository_ctx,
        name = "tensorrt_lib",
        srcs = [lib_root + lib for lib in shared_libs],
        outs = ["tensorrt/lib/" + lib for lib in shared_libs],
    )
    include_rule = make_copy_files_rule(
        repository_ctx,
        name = "tensorrt_include",
        srcs = [include_root + hdr for hdr in header_names],
        outs = ["tensorrt/include/" + hdr for hdr in header_names],
    )
    copy_rules = [lib_rule, include_rule]

    static_root = _get_tensorrt_static_path(repository_ctx)
    if static_root:
        static_root = static_root + "/"

        # Versions below 8 need a few more static libraries next to the
        # regular TensorRT ones.
        if _at_least_version(trt_version, "8"):
            static_bases = _TENSORRT_LIBS
        else:
            static_bases = _TENSORRT_LIBS + [
                "nvrtc",
                "myelin_compiler",
                "myelin_executor",
                "myelin_pattern_library",
                "myelin_pattern_runtime",
            ]

        static_libs = [
            lib_name("%s_static" % base, host_cpu, trt_version, static = True)
            for base in static_bases
        ]
        copy_rules.append(
            make_copy_files_rule(
                repository_ctx,
                name = "tensorrt_static_lib",
                srcs = [static_root + lib for lib in static_libs],
                outs = ["tensorrt/lib/" + lib for lib in static_libs],
            ),
        )

    # Resolve every template path before the first template is written.
    build_defs_tpl = _tpl_path(repository_ctx, "tensorrt:build_defs.bzl")
    build_tpl = _tpl_path(repository_ctx, "tensorrt:BUILD")
    config_h_tpl = _tpl_path(repository_ctx, "tensorrt:tensorrt_config.h")
    config_py_tpl = _tpl_path(repository_ctx, "tensorrt:tensorrt_config.py")

    # Set up config file.
    repository_ctx.template(
        "tensorrt/build_defs.bzl",
        build_defs_tpl,
        {"%{if_tensorrt}": "if_true"},
    )

    # Set up BUILD file.
    repository_ctx.template(
        "tensorrt/BUILD",
        build_tpl,
        {"%{copy_rules}": "\n".join(copy_rules)},
    )

    # Set up tensorrt_config.h with the detected version.
    repository_ctx.template(
        "tensorrt/tensorrt_config.h",
        config_h_tpl,
        {"%{tensorrt_version}": trt_version},
    )

    # Set up tensorrt_config.py, which is used by gen_build_info to provide
    # build environment info to the API
    repository_ctx.template(
        "tensorrt/tensorrt_config.py",
        config_py_tpl,
        _py_tmpl_dict({"tensorrt_version": trt_version}),
    )

def _py_tmpl_dict(d):
    """Returns the substitution dict mapping `%{cuda_config}` to `str(d)`."""
    rendered = str(d)
    return {"%{cuda_config}": rendered}

# Environment variables declared as inputs of the `cuda_configure` rule below
# (passed as its `environ` attribute).
_CUDA_ENVIRONS = [
    _GCC_HOST_COMPILER_PATH,
    _GCC_HOST_COMPILER_PREFIX,
    "NEED_CUDA",
    _CUDA_TOOLKIT_PATH,
    _CUDNN_INSTALL_PATH,
    _CUDA_VERSION,
    _CUDNN_VERSION,
    _CUDA_COMPUTE_CAPABILITIES,
    "NVVMIR_LIBRARY_DIR",
    _PYTHON_BIN_PATH,
    "TMP",
    "TMPDIR",
    "CUDA_PATHS",
]

# Repository rule that generates the local CUDA repository; implemented by
# _create_local_cuda_repository.
cuda_configure = repository_rule(
    implementation = _create_local_cuda_repository,
    environ = _CUDA_ENVIRONS,
)

# Environment variables declared as inputs of the `tensorrt_configure` rule
# below (passed as its `environ` attribute).
_TENSORRT_ENVIRONS = [
    _TENSORRT_INSTALL_PATH,
    _TENSORRT_VERSION,
    _TENSORRT_STATIC_PATH,
    "CUDA_PATHS",
]

# Repository rule that generates the local TensorRT repository; implemented by
# _create_local_tensorrt_repository.
tensorrt_configure = repository_rule(
    implementation = _create_local_tensorrt_repository,
    environ = _TENSORRT_ENVIRONS,
)


================================================
FILE: build_deps/gpus/crosstool/BUILD
================================================


================================================
FILE: build_deps/gpus/crosstool/BUILD.tpl
================================================
# This file is expanded from a template by cuda_configure.bzl
# Update cuda_configure.bzl#verify_build_defines when adding new variables.

load(":cc_toolchain_config.bzl", "cc_toolchain_config")

licenses(["restricted"])

package(default_visibility = ["//visibility:public"])

# Toolchain declaration for Linux x86_64 as both execution and target
# platform; it resolves to the ":cc-compiler-local" cc_toolchain.
toolchain(
    name = "toolchain-linux-x86_64",
    exec_compatible_with = [
        "@platforms//os:linux",
        "@platforms//cpu:x86_64",
    ],
    target_compatible_with = [
        "@platforms//os:linux",
        "@platforms//cpu:x86_64",
    ],
    toolchain = ":cc-compiler-local",
    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
)

# Maps cpu (and "cpu|compiler") keys to a cc_toolchain: the darwin keys select
# ":cc-compiler-darwin", every other key selects ":cc-compiler-local".
cc_toolchain_suite(
    name = "toolchain",
    toolchains = {
        "local|compiler": ":cc-compiler-local",
        "darwin|compiler": ":cc-compiler-darwin",
        "arm": ":cc-compiler-local",
        "aarch64": ":cc-compiler-local",
        "k8": ":cc-compiler-local",
        "piii": ":cc-compiler-local",
        "ppc": ":cc-compiler-local",
        "darwin": ":cc-compiler-darwin",
    },
)

# Local (non-darwin) C++ toolchain. The compiler, archiver, assembler and
# linker file groups all use the compiler_deps placeholder; dwp, objcopy and
# strip use the ":empty" filegroup.
cc_toolchain(
    name = "cc-compiler-local",
    all_files = "%{compiler_deps}",
    compiler_files = "%{compiler_deps}",
    ar_files = "%{compiler_deps}",
    as_files = "%{compiler_deps}",
    dwp_files = ":empty",
    linker_files = "%{compiler_deps}",
    objcopy_files = ":empty",
    strip_files = ":empty",
    # To support linker flags that need to go to the start of command line
    # we need the toolchain to support parameter files. Parameter files are
    # last on the command line and contain all shared libraries to link, so all
    # regular options will be left of them.
    supports_param_files = 1,
    toolchain_identifier = "local_linux",
    toolchain_config = ":cc-compiler-local-config",
)

# Configuration for ":cc-compiler-local"; every placeholder is filled in by
# cuda_configure.bzl when this template is expanded.
cc_toolchain_config(
    name = "cc-compiler-local-config",
    cpu = "local",
    builtin_include_directories = [%{cxx_builtin_include_directories}],
    extra_no_canonical_prefixes_flags = [%{extra_no_canonical_prefixes_flags}],
    host_compiler_path = "%{host_compiler_path}",
    host_compiler_prefix = "%{host_compiler_prefix}",
    host_compiler_warnings = [%{host_compiler_warnings}],
    host_unfiltered_compile_flags = [%{unfiltered_compile_flags}],
    linker_bin_path = "%{linker_bin_path}",
    builtin_sysroot = "%{builtin_sysroot}",
    cuda_path = "%{cuda_toolkit_path}",
    compiler = "%{compiler}",
)

# Darwin C++ toolchain. Same file groups as the local toolchain, but with
# parameter files disabled (supports_param_files = 0).
cc_toolchain(
    name = "cc-compiler-darwin",
    all_files = "%{compiler_deps}",
    compiler_files = "%{compiler_deps}",
    ar_files = "%{compiler_deps}",
    as_files = "%{compiler_deps}",
    dwp_files = ":empty",
    linker_files = "%{compiler_deps}",
    objcopy_files = ":empty",
    strip_files = ":empty",
    supports_param_files = 0,
    toolchain_identifier = "local_darwin",
    toolchain_config = ":cc-compiler-local-darwin",
)

# Configuration for ":cc-compiler-darwin". Unlike the local configuration it
# does not set builtin_sysroot, cuda_path or compiler.
cc_toolchain_config(
    name = "cc-compiler-local-darwin",
    cpu = "darwin",
    builtin_include_directories = [%{cxx_builtin_include_directories}],
    extra_no_canonical_prefixes_flags = [%{extra_no_canonical_prefixes_flags}],
    host_compiler_path = "%{host_compiler_path}",
    host_compiler_prefix = "%{host_compiler_prefix}",
    host_compiler_warnings = [%{host_compiler_warnings}],
    host_unfiltered_compile_flags = [%{unfiltered_compile_flags}],
    linker_bin_path = "%{linker_bin_path}",
)


# Empty filegroup used for the toolchain file attributes that need no files.
filegroup(
    name = "empty",
    srcs = [],
)

# Filegroup wrapping the generated compiler wrapper script.
filegroup(
    name = "crosstool_compiler",
    srcs = ["crosstool_compiler_wrapper"],
)


================================================
FILE: build_deps/gpus/crosstool/cc_toolchain_config.bzl.tpl
================================================
"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows."""

load(
    "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
    "action_config",
    "artifact_name_pattern",
    "env_entry",
    "env_set",
    "feature",
    "feature_set",
    "flag_group",
    "flag_set",
    "tool",
    "tool_path",
    "variable_with_value",
    "with_feature_set",
)
load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")

def all_assembly_actions():
    """Returns the action names `assemble` and `preprocess_assemble`."""
    actions = [ACTION_NAMES.assemble, ACTION_NAMES.preprocess_assemble]
    return actions

def all_compile_actions():
    """Returns the action names of every compile-type action.

    The list holds the two assembly actions, the C compile action and the
    C++ compile, header-parsing, module and linkstamp actions.
    """
    actions = [ACTION_NAMES.assemble]
    actions.append(ACTION_NAMES.c_compile)
    actions.extend([
        ACTION_NAMES.cpp_compile,
        ACTION_NAMES.cpp_header_parsing,
        ACTION_NAMES.cpp_module_codegen,
        ACTION_NAMES.cpp_module_compile,
        ACTION_NAMES.linkstamp_compile,
    ])
    actions.append(ACTION_NAMES.preprocess_assemble)
    return actions

def all_c_compile_actions():
    """Returns a list holding only the `c_compile` action name."""
    actions = [ACTION_NAMES.c_compile]
    return actions

def all_cpp_compile_actions():
    """Returns the action names of the C++ compile-type actions.

    These are `cpp_compile`, `cpp_header_parsing`, the two `cpp_module_*`
    actions and `linkstamp_compile`.
    """
    actions = [ACTION_NAMES.cpp_compile, ACTION_NAMES.cpp_header_parsing]
    actions.extend([
        ACTION_NAMES.cpp_module_codegen,
        ACTION_NAMES.cpp_module_compile,
    ])
    actions.append(ACTION_NAMES.linkstamp_compile)
    return actions

def all_preprocessed_actions():
    """Returns the action names of the preprocessed compile actions.

    The list is the C compile action, the C++ compile-type actions and
    `preprocess_assemble`; plain `assemble` is not part of it.
    """
    actions = [ACTION_NAMES.c_compile]
    actions.extend([
        ACTION_NAMES.cpp_compile,
        ACTION_NAMES.cpp_header_parsing,
        ACTION_NAMES.cpp_module_codegen,
        ACTION_NAMES.cpp_module_compile,
        ACTION_NAMES.linkstamp_compile,
    ])
    actions.append(ACTION_NAMES.preprocess_assemble)
    return actions

def all_link_actions():
    """Returns the action names for linking executables and dynamic libraries."""
    actions = [ACTION_NAMES.cpp_link_executable]
    actions.extend([
        ACTION_NAMES.cpp_link_dynamic_library,
        ACTION_NAMES.cpp_link_nodeps_dynamic_library,
    ])
    return actions

def all_executable_link_actions():
    """Returns a list holding only the `cpp_link_executable` action name."""
    actions = [ACTION_NAMES.cpp_link_executable]
    return actions

def all_shared_library_link_actions():
    """Returns the two action names for linking dynamic libraries."""
    actions = [ACTION_NAMES.cpp_link_dynamic_library]
    actions.append(ACTION_NAMES.cpp_link_nodeps_dynamic_library)
    return actions

def all_archive_actions():
    """Returns a list holding only the `cpp_link_static_library` action name."""
    actions = [ACTION_NAMES.cpp_link_static_library]
    return actions

def all_strip_actions():
    """Returns a list holding only the `strip` action name."""
    actions = [ACTION_NAMES.strip]
    return actions

def _library_to_link(flag_prefix, value, iterate = None):
    """Builds the flag group for one kind of entry in `libraries_to_link`.

    Args:
      flag_prefix: Text put directly in front of the expanded variable.
      value: The `libraries_to_link.type` value this group applies to.
      iterate: Optional field of `libraries_to_link` to iterate over; when
        not given, the `name` field is expanded once.

    Returns:
      A flag_group that expands only when `libraries_to_link.type` equals
      `value`.
    """
    if iterate:
        field = iterate
        iterate_target = "libraries_to_link." + iterate
    else:
        field = "name"
        iterate_target = None

    flag = "{}%{{libraries_to_link.{}}}".format(flag_prefix, field)
    type_matches = variable_with_value(
        name = "libraries_to_link.type",
        value = value,
    )
    return flag_group(
        flags = [flag],
        iterate_over = iterate_target,
        expand_if_equal = type_matches,
    )

def _surround_static_library(prefix, suffix):
    """Returns flag groups that wrap whole-archive libraries in two flags.

    Args:
      prefix: Flag emitted before the library name of a whole archive.
      suffix: Flag emitted after the library name of a whole archive.

    Returns:
      Two flag groups: the first emits prefix, name and suffix when
      `libraries_to_link.is_whole_archive` is true, the second emits only
      the name when it is false.
    """
    whole_archive = flag_group(
        flags = [prefix, "%{libraries_to_link.name}", suffix],
        expand_if_true = "libraries_to_link.is_whole_archive",
    )
    regular = flag_group(
        flags = ["%{libraries_to_link.name}"],
        expand_if_false = "libraries_to_link.is_whole_archive",
    )
    return [whole_archive, regular]

def _prefix_static_library(prefix):
    """Returns flag groups that prefix the name of whole-archive libraries.

    Args:
      prefix: Text prepended to the library name of a whole archive.

    Returns:
      Two flag groups: the first emits the plain name when
      `libraries_to_link.is_whole_archive` is false, the second emits the
      prefixed name when it is true.
    """
    regular = flag_group(
        flags = ["%{libraries_to_link.name}"],
        expand_if_false = "libraries_to_link.is_whole_archive",
    )
    whole_archive = flag_group(
        flags = [prefix + "%{libraries_to_link.name}"],
        expand_if_true = "libraries_to_link.is_whole_archive",
    )
    return [regular, whole_archive]

def _static_library_to_link(alwayslink_prefix, alwayslink_suffix = None):
    """Builds the flag group for `static_library` entries.

    Args:
      alwayslink_prefix: Flag (or name prefix) used for whole archives.
      alwayslink_suffix: Optional closing flag. When given, whole archives
        are surrounded by prefix and suffix; otherwise the prefix is joined
        to the library name.

    Returns:
      A flag_group that expands only when `libraries_to_link.type` equals
      "static_library".
    """
    nested = (
        _surround_static_library(alwayslink_prefix, alwayslink_suffix) if alwayslink_suffix else _prefix_static_library(alwayslink_prefix)
    )
    is_static_library = variable_with_value(
        name = "libraries_to_link.type",
        value = "static_library",
    )
    return flag_group(
        flag_groups = nested,
        expand_if_equal = is_static_library,
    )

def _iterate_flag_group(iterate_over, flags = [], flag_groups = []):
    """Returns a flag group that iterates over a build variable.

    The group only expands when the variable is available.

    Args:
      iterate_over: Name of the variable to iterate over.
      flags: Flags of the group.
      flag_groups: Nested flag groups of the group.
    """
    return flag_group(
        flags = flags,
        flag_groups = flag_groups,
        iterate_over = iterate_over,
        expand_if_available = iterate_over,
    )

def _libraries_to_link_group(flavour):
    """Builds the flag group that expands every `libraries_to_link` entry.

    Args:
      flavour: "linux" or "darwin"; selects the linker-specific spelling of
        object-file groups and whole-archive static libraries.

    Returns:
      A flag_group iterating over `libraries_to_link` (only expanded when that
      variable is available). Fails for any other flavour instead of silently
      returning None, matching the other cpu dispatchers in this file.
    """
    if flavour == "linux":
        return _iterate_flag_group(
            iterate_over = "libraries_to_link",
            flag_groups = [
                # Object file groups are bracketed by --start-lib/--end-lib.
                flag_group(
                    flags = ["-Wl,--start-lib"],
                    expand_if_equal = variable_with_value(
                        name = "libraries_to_link.type",
                        value = "object_file_group",
                    ),
                ),
                _library_to_link("", "object_file_group", "object_files"),
                flag_group(
                    flags = ["-Wl,--end-lib"],
                    expand_if_equal = variable_with_value(
                        name = "libraries_to_link.type",
                        value = "object_file_group",
                    ),
                ),
                _library_to_link("", "object_file"),
                _library_to_link("", "interface_library"),
                _static_library_to_link("-Wl,-whole-archive", "-Wl,-no-whole-archive"),
                _library_to_link("-l", "dynamic_library"),
                _library_to_link("-l:", "versioned_dynamic_library"),
            ],
        )
    elif flavour == "darwin":
        return _iterate_flag_group(
            iterate_over = "libraries_to_link",
            flag_groups = [
                _library_to_link("", "object_file_group", "object_files"),
                _library_to_link("", "object_file"),
                _library_to_link("", "interface_library"),
                _static_library_to_link("-Wl,-force_load,"),
                _library_to_link("-l", "dynamic_library"),
                _library_to_link("-l:", "versioned_dynamic_library"),
            ],
        )
    else:
        fail("Unsupported libraries_to_link flavour: %s" % flavour)

def _action_configs_with_tool(path, actions):
    """Creates one enabled action_config per action, all using the same tool.

    Args:
      path: path of the tool to run for every listed action.
      actions: list of action names.

    Returns:
      A list of action_config objects, in the order of `actions`.
    """
    configs = []
    for action_name in actions:
        configs.append(action_config(
            action_name = action_name,
            enabled = True,
            tools = [tool(path = path)],
        ))
    return configs

def _action_configs(assembly_path, c_compiler_path, cc_compiler_path, archiver_path, linker_path, strip_path):
    """Builds the action configs for every action family of the toolchain.

    Each argument is the tool path used for the corresponding family of
    actions (assembly, C compile, C++ compile, archive, link, strip).

    Returns:
      The concatenated list of action_config objects, in that family order.
    """
    tool_for_actions = [
        (assembly_path, all_assembly_actions()),
        (c_compiler_path, all_c_compile_actions()),
        (cc_compiler_path, all_cpp_compile_actions()),
        (archiver_path, all_archive_actions()),
        (linker_path, all_link_actions()),
        (strip_path, all_strip_actions()),
    ]
    configs = []
    for tool_location, action_names in tool_for_actions:
        configs = configs + _action_configs_with_tool(tool_location, action_names)
    return configs

def _tool_paths(cpu, ctx):
    """Returns the tool_path entries for the host toolchain.

    Args:
      cpu: "local" or "darwin"; anything else fails.
      ctx: rule context; reads `host_compiler_path` and `host_compiler_prefix`.

    Returns:
      A list of tool_path objects. `gcc` points at the host compiler itself;
      every other tool is looked up under the host compiler prefix. The
      archiver is `ar` for "local" and `libtool` for "darwin".
    """
    if cpu not in ["local", "darwin"]:
        fail("Unreachable")

    prefix = ctx.attr.host_compiler_prefix
    archiver = "/ar" if cpu == "local" else "/libtool"
    prefixed_tools = [
        ("ar", archiver),
        ("compat-ld", "/ld"),
        ("cpp", "/cpp"),
        ("dwp", "/dwp"),
        ("gcov", "/gcov"),
        ("ld", "/ld"),
        ("nm", "/nm"),
        ("objcopy", "/objcopy"),
        ("objdump", "/objdump"),
        ("strip", "/strip"),
    ]
    paths = [tool_path(name = "gcc", path = ctx.attr.host_compiler_path)]
    for tool_name, suffix in prefixed_tools:
        paths.append(tool_path(name = tool_name, path = prefix + suffix))
    return paths

def _sysroot_group():
    """Returns a flag group passing `--sysroot` when `sysroot` is available."""
    sysroot_flag = "--sysroot=%{sysroot}"
    return flag_group(
        expand_if_available = "sysroot",
        flags = [sysroot_flag],
    )

def _no_canonical_prefixes_group(extra_flags):
    """Returns a flag group with `-no-canonical-prefixes` plus extra flags.

    Args:
      extra_flags: list of flags appended after `-no-canonical-prefixes`.
    """
    all_flags = ["-no-canonical-prefixes"] + extra_flags
    return flag_group(flags = all_flags)

def _cuda_set(cuda_path, actions):
    """Returns the flag sets that pass `--cuda-path` to the given actions.

    Args:
      cuda_path: CUDA installation path; an empty/unset value yields no flags.
      actions: action names the flag applies to.

    Returns:
      A list with one flag_set, or an empty list when `cuda_path` is falsy.
    """
    if not cuda_path:
        return []

    cuda_path_group = flag_group(flags = ["--cuda-path=" + cuda_path])
    return [
        flag_set(
            actions = actions,
            flag_groups = [cuda_path_group],
        ),
    ]

def _nologo():
    """Returns a flag group containing only the `/nologo` flag."""
    nologo_flags = ["/nologo"]
    return flag_group(flags = nologo_flags)

def _features(cpu, compiler, ctx):
    # Builds the list of toolchain features for the "local" and "darwin" cpus;
    # any other cpu fails. Three always-enabled features carry the actual
    # command lines (compile, archive, link); the remaining features are
    # switches and capability markers.
    #
    # Args:
    #   cpu: "local" or "darwin".
    #   compiler: accepted for signature compatibility; not read in this body.
    #   ctx: rule context; reads host_compiler_warnings,
    #     extra_no_canonical_prefixes_flags, cuda_path and linker_bin_path.
    #
    # Returns:
    #   A list of feature objects.
    if cpu in ["local", "darwin"]:
        return [
            feature(name = "no_legacy_features"),
            # ---- Compilation command line ----
            feature(
                name = "all_compile_flags",
                enabled = True,
                flag_sets = [
                    # Dependency file and split-debug-info output, emitted only
                    # when the corresponding build variables are available.
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [
                            flag_group(
                                flags = ["-MD", "-MF", "%{dependency_file}"],
                                expand_if_available = "dependency_file",
                            ),
                            flag_group(
                                flags = ["-gsplit-dwarf"],
                                expand_if_available = "per_object_debug_info_file",
                            ),
                        ],
                    ),
                    # Preprocessor inputs: defines, forced includes and the
                    # various include search paths.
                    flag_set(
                        actions = all_preprocessed_actions(),
                        flag_groups = [
                            flag_group(
                                flags = ["-frandom-seed=%{output_file}"],
                                expand_if_available = "output_file",
                            ),
                            _iterate_flag_group(
                                flags = ["-D%{preprocessor_defines}"],
                                iterate_over = "preprocessor_defines",
                            ),
                            _iterate_flag_group(
                                flags = ["-include", "%{includes}"],
                                iterate_over = "includes",
                            ),
                            _iterate_flag_group(
                                flags = ["-iquote", "%{quote_include_paths}"],
                                iterate_over = "quote_include_paths",
                            ),
                            _iterate_flag_group(
                                flags = ["-I%{include_paths}"],
                                iterate_over = "include_paths",
                            ),
                            _iterate_flag_group(
                                flags = ["-isystem", "%{system_include_paths}"],
                                iterate_over = "system_include_paths",
                            ),
                            _iterate_flag_group(
                                flags = ["-F", "%{framework_include_paths}"],
                                iterate_over = "framework_include_paths",
                            ),
                        ],
                    ),
                    # Placeholder: no C++-only flags are added here.
                    flag_set(
                        actions = all_cpp_compile_actions(),
                        flag_groups = [],
                    ),
                    # Unconditional compile flags: redacted date/time macros,
                    # PIC/PIE selection, hardening and warnings.
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [
                            flag_group(
                                flags = [
                                    "-Wno-builtin-macro-redefined",
                                    "-D__DATE__=\"redacted\"",
                                    "-D__TIMESTAMP__=\"redacted\"",
                                    "-D__TIME__=\"redacted\"",
                                ],
                            ),
                            flag_group(
                                flags = ["-fPIC"],
                                expand_if_available = "pic",
                            ),
                            flag_group(
                                flags = ["-fPIE"],
                                expand_if_not_available = "pic",
                            ),
                            flag_group(
                                flags = [
                                    "-U_FORTIFY_SOURCE",
                                    "-D_FORTIFY_SOURCE=1",
                                    "-fstack-protector",
                                    "-Wall",
                                ] + ctx.attr.host_compiler_warnings + [
                                    "-fno-omit-frame-pointer",
                                ],
                            ),
                            _no_canonical_prefixes_group(
                                ctx.attr.extra_no_canonical_prefixes_flags,
                            ),
                        ],
                    ),
                    # -DNDEBUG only when the "disable-assertions" feature is on
                    # (it is implied by "opt", see the feature list below).
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [flag_group(flags = ["-DNDEBUG"])],
                        with_features = [with_feature_set(features = ["disable-assertions"])],
                    ),
                    # Flags for the "opt" feature.
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [
                            flag_group(
                                flags = [
                                    "-g0",
                                    "-O2",
                                    "-ffunction-sections",
                                    "-fdata-sections",
                                ],
                            ),
                        ],
                        with_features = [with_feature_set(features = ["opt"])],
                    ),
                    # Flags for the "dbg" feature.
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [flag_group(flags = ["-g"])],
                        with_features = [with_feature_set(features = ["dbg"])],
                    ),
                    # Optional --cuda-path flag (empty list when cuda_path is unset).
                ] + _cuda_set(
                    ctx.attr.cuda_path,
                    all_compile_actions(),
                ) + [
                    # User flags, sysroot, and the source/output selection come
                    # last on the command line.
                    flag_set(
                        actions = all_compile_actions(),
                        flag_groups = [
                            _iterate_flag_group(
                                flags = ["%{user_compile_flags}"],
                                iterate_over = "user_compile_flags",
                            ),
                            _sysroot_group(),
                            flag_group(
                                expand_if_available = "source_file",
                                flags = ["-c", "%{source_file}"],
                            ),
                            flag_group(
                                expand_if_available = "output_assembly_file",
                                flags = ["-S"],
                            ),
                            flag_group(
                                expand_if_available = "output_preprocess_file",
                                flags = ["-E"],
                            ),
                            flag_group(
                                expand_if_available = "output_file",
                                flags = ["-o", "%{output_file}"],
                            ),
                        ],
                    ),
                ],
            ),
            # ---- Static archive command line ----
            feature(
                name = "all_archive_flags",
                enabled = True,
                flag_sets = [
                    flag_set(
                        actions = all_archive_actions(),
                        flag_groups = [
                            flag_group(
                                expand_if_available = "linker_param_file",
                                flags = ["@%{linker_param_file}"],
                            ),
                            flag_group(flags = ["rcsD"]),
                            flag_group(
                                flags = ["%{output_execpath}"],
                                expand_if_available = "output_execpath",
                            ),
                            # Only plain object files and object file groups
                            # are passed to the archiver.
                            flag_group(
                                iterate_over = "libraries_to_link",
                                flag_groups = [
                                    flag_group(
                                        flags = ["%{libraries_to_link.name}"],
                                        expand_if_equal = variable_with_value(
                                            name = "libraries_to_link.type",
                                            value = "object_file",
                                        ),
                                    ),
                                    flag_group(
                                        flags = ["%{libraries_to_link.object_files}"],
                                        iterate_over = "libraries_to_link.object_files",
                                        expand_if_equal = variable_with_value(
                                            name = "libraries_to_link.type",
                                            value = "object_file_group",
                                        ),
                                    ),
                                ],
                                expand_if_available = "libraries_to_link",
                            ),
                        ],
                    ),
                ],
            ),
            # ---- Link command line ----
            feature(
                name = "all_link_flags",
                enabled = True,
                flag_sets = [
                    flag_set(
                        actions = all_shared_library_link_actions(),
                        flag_groups = [flag_group(flags = ["-shared"])],
                    ),
                    flag_set(
                        actions = all_link_actions(),
                        # -Wl,-no-as-needed is added for "local" only; -B is
                        # added only when linker_bin_path is set.
                        flag_groups = ([
                            flag_group(flags = ["-Wl,-no-as-needed"])
                        ] if cpu == "local" else []) + ([
                            flag_group(flags = ["-B" + ctx.attr.linker_bin_path])
                        ] if ctx.attr.linker_bin_path else []) + [
                            flag_group(
                                flags = ["@%{linker_param_file}"],
                                expand_if_available = "linker_param_file",
                            ),
                            _iterate_flag_group(
                                flags = ["%{linkstamp_paths}"],
                                iterate_over = "linkstamp_paths",
                            ),
                            flag_group(
                                flags = ["-o", "%{output_execpath}"],
                                expand_if_available = "output_execpath",
                            ),
                            _iterate_flag_group(
                                flags = ["-L%{library_search_directories}"],
                                iterate_over = "library_search_directories",
                            ),
                            # rpath entries are relative to $ORIGIN on "local"
                            # and to @loader_path on "darwin".
                            _iterate_flag_group(
                                iterate_over = "runtime_library_search_directories",
                                flags = [
                                    "-Wl,-rpath,$ORIGIN/%{runtime_library_search_directories}",
                                ] if cpu == "local" else [
                                    "-Wl,-rpath,@loader_path/%{runtime_library_search_directories}",
                                ],
                            ),
                            _libraries_to_link_group("darwin" if cpu == "darwin" else "linux"),
                            _iterate_flag_group(
                                flags = ["%{user_link_flags}"],
                                iterate_over = "user_link_flags",
                            ),
                            flag_group(
                                flags = ["-Wl,--gdb-index"],
                                expand_if_available = "is_using_fission",
                            ),
                            flag_group(
                                flags = ["-Wl,-S"],
                                expand_if_available = "strip_debug_symbols",
                            ),
                            # C++ runtime: libc++ on "darwin", libstdc++ otherwise.
                            flag_group(flags = ["-lc++" if cpu == "darwin" else "-lstdc++"]),
                            _no_canonical_prefixes_group(
                                ctx.attr.extra_no_canonical_prefixes_flags,
                            ),
                        ],
                    ),
                    flag_set(
                        actions = all_executable_link_actions(),
                        flag_groups = [flag_group(flags = ["-pie"])],
                    ),
                    # The next two flag sets are added for "local" only.
                ] + ([
                    flag_set(
                        actions = all_link_actions(),
                        flag_groups = [flag_group(flags = [
                            "-Wl,-z,relro,-z,now",
                        ])],
                    ),
                ] if cpu == "local" else []) + ([
                    flag_set(
                        actions = all_link_actions(),
                        flag_groups = [
                            flag_group(flags = ["-Wl,--gc-sections"]),
                            flag_group(
                                flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"],
                            ),
                        ],
                    ),
                    # Added for "darwin" only.
                ] if cpu == "local" else []) + ([
                    flag_set(
                        actions = all_link_actions(),
                        flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])],
                    ),
                    # Optional --cuda-path flag, then the sysroot flag last.
                ] if cpu == "darwin" else []) + _cuda_set(
                    ctx.attr.cuda_path,
                    all_link_actions(),
                ) + [
                    flag_set(
                        actions = all_link_actions(),
                        flag_groups = [
                            _sysroot_group(),
                        ],
                    ),
                ],
            ),
            # ---- Switches referenced by with_features above, and capability
            # markers that are always enabled ----
            feature(name = "disable-assertions"),
            feature(
                name = "opt",
                implies = ["disable-assertions"],
            ),
            feature(name = "fastbuild"),
            feature(name = "dbg"),
            feature(name = "supports_dynamic_linker", enabled = True),
            feature(name = "pic", enabled = True),
            feature(name = "supports_pic", enabled = True),
            feature(name = "has_configured_linker_path", enabled = True),
        ]
    else:
        fail("Unreachable")

def _impl(ctx):
    """Implementation of the `cc_toolchain_config` rule.

    Picks the per-cpu identifiers and archiver, then returns a
    CcToolchainConfigInfo together with a placeholder executable (the rule is
    declared `executable = True`).

    Args:
      ctx: rule context.

    Returns:
      A list with the CcToolchainConfigInfo provider and a DefaultInfo whose
      executable is a file containing the text "Fake executable".
    """
    cpu = ctx.attr.cpu
    compiler = ctx.attr.compiler

    # Only the identifiers, libc name, compiler name and archiver differ
    # between the two supported cpus.
    if cpu == "darwin":
        toolchain_identifier = "local_darwin"
        target_cpu = "darwin"
        target_libc = "macosx"
        compiler = "compiler"
        archiver_suffix = "/libtool"
    elif cpu == "local":
        toolchain_identifier = "local_linux"
        target_cpu = "local"
        target_libc = "local"
        archiver_suffix = "/ar"
    else:
        fail("Unreachable")

    host_compiler = ctx.attr.host_compiler_path
    tool_prefix = ctx.attr.host_compiler_prefix

    # The host compiler drives assembling, compiling and linking.
    action_configs = _action_configs(
        assembly_path = host_compiler,
        c_compiler_path = host_compiler,
        cc_compiler_path = host_compiler,
        archiver_path = tool_prefix + archiver_suffix,
        linker_path = host_compiler,
        strip_path = tool_prefix + "/strip",
    )
    artifact_name_patterns = []

    placeholder = ctx.actions.declare_file(ctx.label.name)
    ctx.actions.write(placeholder, "Fake executable")

    toolchain_config = cc_common.create_cc_toolchain_config_info(
        ctx = ctx,
        features = _features(cpu, compiler, ctx),
        action_configs = action_configs,
        artifact_name_patterns = artifact_name_patterns,
        cxx_builtin_include_directories = ctx.attr.builtin_include_directories,
        toolchain_identifier = toolchain_identifier,
        host_system_name = "local",
        target_system_name = "local",
        target_cpu = target_cpu,
        target_libc = target_libc,
        compiler = compiler,
        abi_version = "local",
        abi_libc_version = "local",
        tool_paths = _tool_paths(cpu, ctx),
        make_variables = [],
        builtin_sysroot = ctx.attr.builtin_sysroot,
        cc_target_os = None,
    )
    return [toolchain_config, DefaultInfo(executable = placeholder)]

# Rule producing the CcToolchainConfigInfo for the host ("local" or "darwin")
# toolchain. It is declared executable because _impl also emits a placeholder
# file as its DefaultInfo executable.
cc_toolchain_config = rule(
    implementation = _impl,
    attrs = {
        # Selects the toolchain flavour; only these two values are accepted.
        "cpu": attr.string(mandatory = True, values = ["darwin", "local"]),
        "compiler": attr.string(values = ["unknown"], default = "unknown"),
        "builtin_include_directories": attr.string_list(),
        # Appended after -no-canonical-prefixes on compile and link lines.
        "extra_no_canonical_prefixes_flags": attr.string_list(),
        # Tool used for assembling, compiling and linking.
        "host_compiler_path": attr.string(),
        # Directory prefix under which ar/libtool, ld, strip, ... are looked up.
        "host_compiler_prefix": attr.string(),
        # Extra warning flags inserted after -Wall.
        "host_compiler_warnings": attr.string_list(),
        # NOTE(review): not read by _features/_impl as visible here — confirm
        # whether it is still needed.
        "host_unfiltered_compile_flags": attr.string_list(),
        # When set, passed to the linker driver as -B<path>.
        "linker_bin_path": attr.string(),
        "builtin_sysroot": attr.string(),
        # When set, passed as --cuda-path=<path> to compile and link actions.
        "cuda_path": attr.string(),
    },
    provides = [CcToolchainConfigInfo],
    executable = True,
)


================================================
FILE: build_deps/gpus/crosstool/crosstool_compiler_wrapper.tpl
================================================
#!/usr/bin/env python
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Crosstool wrapper for compiling CUDA programs.

SYNOPSIS:
  crosstool_compiler_wrapper [options passed in by cc_library()
                                or cc_binary() rule]

DESCRIPTION:
  This script is expected to be called by the cc_library() or cc_binary() bazel
  rules. When the option "-x cuda" is present in the list of arguments passed
  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
  as is as a string to --compiler-options of nvcc. When "-x cuda" is not
  present, this wrapper invokes hybrid_driver_is_not_gcc with the input
  arguments as is.
"""

__author__ = 'keveman@google.com (Manjunath Kudlur)'

import os
import pipes
import re
import subprocess
import sys
from argparse import ArgumentParser

# Template values set by cuda_autoconf. Each '%{...}' placeholder below is
# expected to be replaced with a concrete value before this script is run.

# Compiler invoked by main() for inputs that are not '-x cuda'.
CPU_COMPILER = ('%{cpu_compiler}')
# Host compiler handed to nvcc through --compiler-bindir.
GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}')

# nvcc executable used by InvokeNvcc().
NVCC_PATH = '%{nvcc_path}'
# Directory of the host compiler; InvokeNvcc() prepends it to PATH.
PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
# CUDA version string; used to pick option spellings and allowed -std values.
NVCC_VERSION = '%{cuda_version}'


def Log(s):
    """Print s to stdout, tagged with the 'gpus/crosstool: ' prefix."""
    message = 'gpus/crosstool: {0}'.format(s)
    print(message)


def GetOptionValue(argv, option):
    """Return every value given for option in argv as one flat list.

  Args:
    argv: A list of strings, possibly the argv passed to main().
    option: The option whose value to extract, with the leading '-'.

  Returns:
    A list of values, either directly following the option
    (e.g. -opt val1 val2) or collected from multiple occurrences of the
    option (e.g. -opt val1 -opt val2). Empty when the option is absent or
    carries no values.
  """

    parser = ArgumentParser()
    parser.add_argument(option, nargs='*', action='append')
    namespace, _unknown = parser.parse_known_args(argv)

    # argparse stores the values under the option name without its leading
    # dashes and with the remaining dashes turned into underscores.
    dest = option.lstrip('-').replace('-', '_')
    occurrences = vars(namespace)[dest]
    if not occurrences:
        return []

    # One inner list per occurrence of the option; flatten them in order.
    return [value for group in occurrences for value in group]


def GetHostCompilerOptions(argv):
    """Collect the options that are forwarded to the host compiler.

  The options picked up from argv are -isystem, -iquote, -g,
  -fno-canonical-system-headers, -no-canonical-prefixes and --sysroot.

  Args:
    argv: A list of strings, possibly the argv passed to main().

  Returns:
    The string that can be used as the --compiler-options to nvcc. Every
    emitted option is preceded by a single space; the string is empty when
    none of the options is present.
  """

    parser = ArgumentParser()
    parser.add_argument('-isystem', nargs='*', action='append')
    parser.add_argument('-iquote', nargs='*', action='append')
    parser.add_argument('--sysroot', nargs=1)
    parser.add_argument('-g', nargs='*', action='append')
    parser.add_argument('-fno-canonical-system-headers', action='store_true')
    parser.add_argument('-no-canonical-prefixes', action='store_true')

    parsed, _unknown = parser.parse_known_args(argv)

    # Pieces are appended in a fixed order, independent of the order in argv.
    pieces = []
    if parsed.isystem:
        system_dirs = sum(parsed.isystem, [])
        pieces.append(' -isystem ' + ' -isystem '.join(system_dirs))
    if parsed.iquote:
        quote_dirs = sum(parsed.iquote, [])
        pieces.append(' -iquote ' + ' -iquote '.join(quote_dirs))
    if parsed.g:
        # A bare -g yields ' -g'; -g<level> yields ' -g<level>'.
        debug_levels = sum(parsed.g, [])
        pieces.append(' -g' + ' -g'.join(debug_levels))
    if parsed.fno_canonical_system_headers:
        pieces.append(' -fno-canonical-system-headers')
    if parsed.no_canonical_prefixes:
        pieces.append(' -no-canonical-prefixes')
    if parsed.sysroot:
        pieces.append(' --sysroot ' + parsed.sysroot[0])

    return ''.join(pieces)


def _update_options(nvcc_options):
    """Rename nvcc options whose spelling depends on the CUDA version.

  Args:
    nvcc_options: A list of nvcc option names without the leading '--'.

  Returns:
    The list unchanged for CUDA 7.0; otherwise a new list in which
    'relaxed-constexpr' is replaced by 'expt-relaxed-constexpr'.
  """
    if NVCC_VERSION in ("7.0", ):
        return nvcc_options

    renamed = {"relaxed-constexpr": "expt-relaxed-constexpr"}
    result = []
    for name in nvcc_options:
        result.append(renamed.get(name, name))
    return result


def GetNvccOptions(argv):
    """Collect the -nvcc_options values from argv.

  Args:
    argv: A list of strings, possibly the argv passed to main().

  Returns:
    The string that can be passed directly to nvcc: every collected value
    prefixed with '--' and joined by single spaces, or '' when no
    -nvcc_options values are present.
  """

    parser = ArgumentParser()
    parser.add_argument('-nvcc_options', nargs='*', action='append')
    parsed, _unknown = parser.parse_known_args(argv)

    if not parsed.nvcc_options:
        return ''

    # Flatten the per-occurrence lists, then apply version-specific renames.
    names = _update_options(sum(parsed.nvcc_options, []))
    return ' '.join('--' + name for name in names)


def system(cmd):
    """Run cmd through os.system() and decode its wait status.

  Args:
    cmd: The command.

  Returns:
    The exit code if the process exited with exit() or -signal
    if the process was terminated by a signal.
  """
    wait_status = os.system(cmd)
    if not os.WIFEXITED(wait_status):
        return -os.WTERMSIG(wait_status)
    return os.WEXITSTATUS(wait_status)


def InvokeNvcc(argv, log=False):
    """Call nvcc with arguments assembled from argv.

  Args:
    argv: A list of strings, possibly the argv passed to main().
    log: True if logging is requested.

  Returns:
    1 when -c carries no values or when there is not exactly one -o value.
    Otherwise the exit status of the dependency-file run if that run was
    requested (-MF) and failed, else the exit status of the compile run, as
    reported by system().
  """

    host_compiler_options = GetHostCompilerOptions(argv)
    nvcc_compiler_options = GetNvccOptions(argv)
    opt_option = GetOptionValue(argv, '-O')
    m_options = GetOptionValue(argv, '-m')
    m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
    include_options = GetOptionValue(argv, '-I')
    out_file = GetOptionValue(argv, '-o')
    depfiles = GetOptionValue(argv, '-MF')
    defines = GetOptionValue(argv, '-D')
    defines = ''.join([' -D' + define for define in defines])
    undefines = GetOptionValue(argv, '-U')
    undefines = ''.join([' -U' + define for define in undefines])
    std_options = GetOptionValue(argv, '-std')
    nvcc_allowed_std_options = ["c++03", "c++11", "c++14"]
    nvcc_std_map = {}
    if int(NVCC_VERSION.split('.')[0]) >= 11:
        nvcc_std_map["c++1z"] = "c++17"
        nvcc_allowed_std_options += ["c++17", "c++1z"]
    # Only the last allowed -std value is kept.
    std_options = ''.join([
        ' -std=' + (nvcc_std_map[define] if define in nvcc_std_map else define)
        for define in std_options if define in nvcc_allowed_std_options
    ][-1:])
    fatbin_options = ''.join([
        ' --fatbin-options=' + option
        for option in GetOptionValue(argv, '-Xcuda-fatbinary')
    ])

    # The list of source files gets passed after the -c option. I don't know of
    # any other reliable way to just get the list of source files to be compiled.
    src_files = GetOptionValue(argv, '-c')

    # Pass -w through from host to nvcc, but don't do anything fancier with
    # warnings-related flags, since they're not necessarily the same across
    # compilers.
    warning_options = ' -w' if '-w' in argv else ''

    if len(src_files) == 0:
        return 1
    if len(out_file) != 1:
        return 1

    # Any positive numeric level maps to -O2; no level or level 0 maps to -g.
    # Non-numeric levels (-Os, -Oz, -Ofast, -Og, ...) must not crash the
    # wrapper: size/speed levels count as optimized, anything else as debug.
    opt_level = opt_option[0] if len(opt_option) > 0 else ''
    if opt_level.isdigit():
        optimized = int(opt_level) > 0
    else:
        optimized = opt_level in ('s', 'z', 'fast')
    opt = ' -O2' if optimized else ' -g'

    includes = (' -I ' + ' -I '.join(include_options)
                if len(include_options) > 0 else '')

    # Unfortunately, there are other options that have -c prefix too.
    # So allowing only those look like C/C++ files.
    # Raw string: '\.' is not a valid escape in a normal string literal.
    source_pattern = r'\.cpp$|\.cc$|\.c$|\.cxx$|\.C|\.cu|\.cuh$'
    src_files = [f for f in src_files if re.search(source_pattern, f)]
    srcs = ' '.join(src_files)
    out = ' -o ' + out_file[0]

    nvccopts = '-D_FORCE_INLINES '
    capabilities_sm = set(GetOptionValue(argv, "--cuda-gpu-arch"))
    capabilities_compute = set(GetOptionValue(argv, '--cuda-include-ptx'))
    # When both "code=sm_xy" and "code=compute_xy" are requested for a single
    # arch, they can be combined using "code=sm_xy,compute_xy" which avoids a
    # redundant PTX generation during compilation.
    capabilities_both = capabilities_sm.intersection(capabilities_compute)
    for capability in capabilities_both:
        capability = capability[len('sm_'):]
        nvccopts += r'-gencode=arch=compute_%s,code=\"sm_%s,compute_%s\" ' % (
            capability, capability, capability)
    for capability in capabilities_sm - capabilities_both:
        capability = capability[len('sm_'):]
        nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s\" ' % (capability,
                                                                   capability)
    for capability in capabilities_compute - capabilities_both:
        capability = capability[len('sm_'):]
        nvccopts += r'-gencode=arch=compute_%s,\"code=compute_%s\" ' % (
            capability, capability)
    nvccopts += nvcc_compiler_options
    nvccopts += undefines
    nvccopts += defines
    nvccopts += std_options
    nvccopts += m_options
    nvccopts += warning_options
    # Force C++17 dialect (note, everything in just one string!)
    nvccopts += ' --std c++17 '
    nvccopts += fatbin_options

    if depfiles:
        # Generate the dependency file
        depfile = depfiles[0]
        cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' +
               host_compiler_options + '"' + ' --compiler-bindir=' +
               GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes +
               ' ' + srcs + ' -M -o ' + depfile)
        if log:
            Log(cmd)
        exit_status = system(cmd)
        if exit_status != 0:
            return exit_status

    cmd = (NVCC_PATH + ' ' + nvccopts + ' --compiler-options "' +
           host_compiler_options + ' -fPIC"' + ' --compiler-bindir=' +
           GCC_HOST_COMPILER_PATH + ' -I .' + ' -x cu ' + opt + includes +
           ' -c ' + srcs + out)

    # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
    # Need to investigate and fix.
    cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
    if log:
        Log(cmd)
    return system(cmd)


def main():
    """Dispatches one compile invocation to nvcc or to the host compiler.

    Only the wrapper's own options (`-x` and `--cuda_log`) are parsed; all
    other arguments are left as they were given. With `-x cuda` the leftover
    arguments are shell-quoted and passed to InvokeNvcc. Otherwise the
    original argument list, minus anything starting with `--cuda_log`, is
    forwarded to CPU_COMPILER.

    Returns:
      Whatever InvokeNvcc or subprocess.call returned.
    """
    option_parser = ArgumentParser()
    option_parser.add_argument('-x', nargs=1)
    option_parser.add_argument('--cuda_log', action='store_true')
    args, leftover = option_parser.parse_known_args(sys.argv[1:])

    verbose = args.cuda_log
    wants_cuda = bool(args.x) and args.x[0] == 'cuda'
    if wants_cuda:
        if verbose:
            Log('-x cuda')
        quoted = list(map(pipes.quote, leftover))
        if verbose:
            Log('using nvcc')
        return InvokeNvcc(quoted, log=verbose)

    # Host-compiler path. 'leftover' cannot be reused here because parsing
    # removed -x from it; the CPU compiler needs -x and needs it at its
    # original position in argv. So filter the raw argv instead, dropping only
    # this wrapper's own logging flag.
    passthrough = []
    for flag in sys.argv[1:]:
        if flag.startswith('--cuda_log'):
            continue
        passthrough.append(flag)

    return subprocess.call([CPU_COMPILER] + passthrough)


# Run main() only when this file is executed as a script, and use its return
# value as the process exit status.
if __name__ == '__main__':
    sys.exit(main())


================================================
FILE: build_deps/gpus/cuda/BUILD
================================================


================================================
FILE: build_deps/gpus/cuda/BUILD.tpl
================================================
load(":build_defs.bzl", "cuda_header_library")
load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
load("@bazel_skylib//lib:selects.bzl", "selects")
load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")

licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like

package(default_visibility = ["//visibility:public"])

# Boolean build setting that switches CUDA support on; it defaults to off.
bool_flag(
    name = "enable_cuda",
    build_setting_default = False,
)

# Matches when the :enable_cuda flag above is set to True.
config_setting(
    name = "is_cuda_enabled",
    flag_values = {":enable_cuda": "True"},
)


# Config setting whether built with CUDA support using nvcc.
#
# TODO(b/174244321), DEPRECATED: this target will be removed when all users
# have been converted to :is_cuda_enabled (most) or :is_cuda_compiler_nvcc.
#
# NOTE(review): "//:is_cuda_compiler_nvcc" is not declared in this template;
# confirm it is provided by the package these labels resolve to.
selects.config_setting_group(
    name = "using_nvcc",
    match_all = [
        "//:is_cuda_enabled",
        "//:is_cuda_compiler_nvcc",
    ],
)

# Package-private setting that matches builds with compilation_mode=opt.
config_setting(
    name = "_opt",
    values = {"compilation_mode": "opt"},
    visibility = ["//visibility:private"],
)

# Provides CUDA headers for '#include "third_party/gpus/cuda/include/cuda.h"'
# All clients including TensorFlow should use these directives.
# Declared through the cuda_header_library macro loaded from :build_defs.bzl.
cuda_header_library(
    name = "cuda_headers",
    hdrs = [
        "cuda/cuda_config.h",
        ":cuda-include",
    ],
    include_prefix = "third_party/gpus",
    includes = [
        ".",  # required to include cuda/cuda/cuda_config.h as cuda/config.h
        "cuda/include",
    ],
)

# CUDA runtime, static variant. The library file name and one extra link
# option are template placeholders (%{...}) filled in when this file is
# generated; -ldl and -lpthread are always added to the link line.
cc_library(
    name = "cudart_static",
    srcs = ["cuda/lib/%{cudart_static_lib}"],
    linkopts = [
        "-ldl",
        "-lpthread",
        %{cudart_static_linkopt}
    ],
)

# CUDA driver library; the file name is a template placeholder.
cc_library(
    name = "cuda_driver",
    srcs = ["cuda/lib/%{cuda_driver_lib}"],
)

# CUDA runtime library. The same file is listed in both srcs and data.
cc_library(
    name = "cudart",
    srcs = ["cuda/lib/%{cudart_lib}"],
    data = ["cuda/lib/%{cudart_lib}"],
    linkstatic = 1,
)

# Per-library header targets. Each one strips its own "<lib>/include" prefix,
# re-exposes the headers under "third_party/gpus/cuda/include", and depends on
# :cuda_headers.

# cuBLAS headers.
cuda_header_library(
    name = "cublas_headers",
    hdrs = [":cublas-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["cublas/include"],
    strip_include_prefix = "cublas/include",
    deps = [":cuda_headers"],
)

# cuSOLVER headers.
cuda_header_library(
    name = "cusolver_headers",
    hdrs = [":cusolver-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["cusolver/include"],
    strip_include_prefix = "cusolver/include",
    deps = [":cuda_headers"],
)

# cuFFT headers.
cuda_header_library(
    name = "cufft_headers",
    hdrs = [":cufft-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["cufft/include"],
    strip_include_prefix = "cufft/include",
    deps = [":cuda_headers"],
)

# cuSPARSE headers.
cuda_header_library(
    name = "cusparse_headers",
    hdrs = [":cusparse-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["cusparse/include"],
    strip_include_prefix = "cusparse/include",
    deps = [":cuda_headers"],
)

# cuRAND headers.
cuda_header_library(
    name = "curand_headers",
    hdrs = [":curand-include"],
    include_prefix = "third_party/gpus/cuda/include",
    includes = ["curand/include"],
    strip_include_prefix = "curand/include",
    deps = [":cuda_headers"],
)

# Library targets. Each file name is a template placeholder (%{...}) and the
# same file is listed in both srcs and data.

# cuBLAS library.
cc_library(
    name = "cublas",
    srcs = ["cuda/lib/%{cublas_lib}"],
    data = ["cuda/lib/%{cublas_lib}"],
    linkstatic = 1,
)

# cuBLASLt library.
cc_library(
    name = "cublasLt",
    srcs = ["cuda/lib/%{cublasLt_lib}"],
    data = ["cuda/lib/%{cublasLt_lib}"],
    linkstatic = 1,
)

# cuSOLVER library; additionally links against gomp.
cc_library(
    name = "cusolver",
    srcs = ["cuda/lib/%{cusolver_lib}"],
    data = ["cuda/lib/%{cusolver_lib}"],
    linkopts = ["-lgomp"],
    linkstatic = 1,
)

# cuDNN library.
cc_library(
    name = "cudnn",
    srcs = ["cuda/lib/%{cudnn_lib}"],
    data = ["cuda/lib/%{cudnn_lib}"],
    linkstatic = 1,
)

# cuDNN headers, exposed under "third_party/gpus/cudnn". This one uses a plain
# cc_library rather than the cuda_header_library macro.
cc_library(
    name = "cudnn_header",
    hdrs = [":cudnn-include"],
    include_prefix = "third_party/gpus/cudnn",
    strip_include_prefix = "cudnn/include",
    deps = [":cuda_headers"],
)

# cuFFT library.
cc_library(
    name = "cufft",
    srcs = ["cuda/lib/%{cufft_lib}"],
    data = ["cuda/lib/%{cufft_lib}"],
    linkstatic = 1,
)

# cuRAND library.
cc_library(
    name = "curand",
    srcs = ["cuda/lib/%{curand_lib}"],
    data = ["cuda/lib/%{curand_lib}"],
    linkstatic = 1,
)

# Convenience target bundling the CUDA headers with the cuBLAS, cuBLASLt,
# cudart, cuDNN, cuFFT and cuRAND libraries. cuSOLVER and cuSPARSE are not
# part of this bundle.
cc_library(
    name = "cuda",
    deps = [
        ":cublas",
        ":cublasLt",
        ":cuda_headers",
        ":cudart",
        ":cudnn",
        ":cufft",
        ":curand",
    ],
)

# CUB headers; the aliased target is a template placeholder.
alias(
    name = "cub_headers",
    actual = "%{cub_actual}",
)

# CUPTI headers, taken from the :cuda-extras file set.
cuda_header_library(
    name = "cupti_headers",
    hdrs = [":cuda-extras"],
    include_prefix = "third_party/gpus",
    includes = ["cuda/extras/CUPTI/include/"],
    deps = [":cuda_headers"],
)

# CUPTI library, listed as data only (it is not in srcs).
cc_library(
    name = "cupti_dsos",
    data = ["cuda/lib/%{cupti_lib}"],
)

# cuSPARSE library; additionally links against gomp.
cc_library(
    name = "cusparse",
    srcs = ["cuda/lib/%{cusparse_lib}"],
    data = ["cuda/lib/%{cusparse_lib}"],
    linkopts = ["-lgomp"],
    linkstatic = 1,
)

# Carries the :cuda-nvvm file set as data.
cc_library(
    name = "libdevice_root",
    data = [":cuda-nvvm"],
)

# Exposes build_defs.bzl as a bzl_library.
bzl_library(
    name = "build_defs_bzl",
    srcs = ["build_defs.bzl"],
    deps = [
        "@bazel_skylib//lib:selects",
    ],
)

# Python module generated from cuda_config.py.tpl.
py_library(
    name = "cuda_config_py",
    srcs = ["cuda/cuda_config.py"],
)

# Template placeholder; presumably expands to the rules that copy or link the
# CUDA files referenced above into this repository (TODO confirm).
%{copy_rules}


================================================
FILE: build_deps/gpus/cuda/build_defs.bzl.tpl
================================================
# Macros for building CUDA code.
def cuda_default_copts():
    """Default options for all CUDA compilations.

    Returns:
      A list made of `-x cuda`, `-DUSE_CUDA=1` and
      `-Xcuda-fatbinary=--compress-all`, followed by whatever list the
      %{cuda_extra_copts} template placeholder expands to.
    """
    return [
        "-x",
        "cuda",
        "-DUSE_CUDA=1",
        "-Xcuda-fatbinary=--compress-all",
    ] + %{cuda_extra_copts}


def cuda_gpu_architectures():
    """Returns a list of supported GPU architectures.

    The value is the expansion of the %{cuda_gpu_architectures} template
    placeholder, so it is fixed when this file is generated.
    """
    return %{cuda_gpu_architectures}


def cuda_header_library(name,
                        hdrs,
                        include_prefix=None,
                        strip_include_prefix=None,
                        deps=[],
                        **kwargs):
    """Generates a cc_library containing both virtual and system include paths.

    Generates both a header-only target with virtual includes plus the full
    target without virtual includes. This works around the fact that bazel can't
    mix 'includes' and 'include_prefix' in the same target.

    Args:
      name: Name of the public target; the helper target is "<name>_virtual".
      hdrs: Headers exposed by both targets.
      include_prefix: Forwarded to the "_virtual" target only.
      strip_include_prefix: Forwarded to the "_virtual" target only.
      deps: Dependencies added to both targets.
      **kwargs: Extra attributes (for example `includes`) forwarded to the
        public target only.
    """

    # Private helper target that carries the include_prefix /
    # strip_include_prefix (virtual include) configuration.
    native.cc_library(
        name=name + "_virtual",
        hdrs=hdrs,
        include_prefix=include_prefix,
        strip_include_prefix=strip_include_prefix,
        deps=deps,
        visibility=["//visibility:private"],
    )

    # Public target: lists the same headers as textual_hdrs and depends on the
    # "_virtual" helper above.
    native.cc_library(name=name,
                      textual_hdrs=hdrs,
                      deps=deps + [":%s_virtual" % name],
                      **kwargs)


def cuda_cc_library(copts=[], **kwargs):
    """Wrapper over cc_library which adds default CUDA options.

    Args:
      copts: Extra compiler options, appended after cuda_default_copts().
      **kwargs: Forwarded unchanged to native.cc_library.
    """
    native.cc_library(copts=cuda_default_copts() + copts, **kwargs)


def cuda_cc_binary(copts=[], **kwargs):
    """Wrapper over cc_binary which adds default CUDA options.

    Args:
      copts: Extra compiler options, appended after cuda_default_copts().
      **kwargs: Forwarded unchanged to native.cc_binary.
    """
    native.cc_binary(copts=cuda_default_copts() + copts, **kwargs)


def cuda_cc_test(copts=[], **kwargs):
    """Wrapper over cc_test that forwards `copts` and all other attributes.

    NOTE(review): unlike cuda_cc_library and cuda_cc_binary, this wrapper does
    not prepend cuda_default_copts(); `copts` is passed through unchanged.
    Confirm whether that is intentional before relying on default CUDA options
    in tests.

    Args:
      copts: Compiler options passed to native.cc_test as-is.
      **kwargs: Forwarded unchanged to native.cc_test.
    """
    native.cc_test(copts=copts, **kwargs)


================================================
FILE: build_deps/gpus/cuda/cuda_config.h.tpl
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef CUDA_CUDA_CONFIG_H_
#define CUDA_CUDA_CONFIG_H_

// Version macros for the CUDA toolkit and its component libraries. Every
// value is a string literal whose content is a %{...} template placeholder,
// filled in when this .tpl file is expanded.
#define CUDA_VERSION "%{cuda_version}"
#define CUDART_VERSION "%{cudart_version}"
#define CUPTI_VERSION "%{cupti_version}"
#define CUBLAS_VERSION "%{cublas_version}"
#define CUSOLVER_VERSION "%{cusolver_version}"
#define CURAND_VERSION "%{curand_version}"
#define CUFFT_VERSION "%{cufft_version}"
#define CUSPARSE_VERSION "%{cusparse_version}"
#define CUDNN_VERSION "%{cudnn_version}"

// Toolkit location, also a string literal filled in from a placeholder.
#define CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}"

// Unlike the macros above this placeholder is not wrapped in quotes, so the
// expansion is emitted verbatim as the macro body.
#define CUDA_COMPUTE_CAPABILITIES %{cuda_compute_capabilities}

#endif  // CUDA_CUDA_CONFIG_H_


================================================
FILE: build_deps/gpus/cuda/cuda_config.py.tpl
================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# The right-hand side is a template placeholder, replaced when this .tpl file
# is expanded. The substituted text must be a valid Python expression;
# presumably it is the detected CUDA configuration (TODO confirm).
config = %{cuda_config}


================================================
FILE: build_deps/gpus/find_cuda_config.py
================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Prints CUDA library and header directories and versions found on the system.

The script searches for CUDA library and header files on the system, inspects
them to determine their version and prints the configuration to stdout.
The paths to inspect and the required versions are specified through environment
variables. If no valid configuration is found, the script prints to stderr and
returns an error code.

The list of libraries to find is specified as arguments. Supported libraries are
CUDA (includes cuBLAS), cuDNN, NCCL, and TensorRT.

The script takes a list of base directories specified by the CUDA_PATHS
environment variable as comma-separated glob list. The script looks for headers
and library files in a hard-coded set of subdirectories from these base paths.
If CUDA_PATHS is not specified, an OS-specific default is used:

  Linux:   /usr/local/cuda, /usr, and paths from 'ldconfig -p'.
  Windows: CUDA_PATH environment variable, or
           C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\*

For backwards compatibility, some libraries also use alternative base
directories from other environment variables if they are specified. List of
library-specific environment variables:

  Library   Version env variable  Additional base directories
  ----------------------------------------------------------------
  CUDA      CUDA_VERSION       CUDA_TOOLKIT_PATH
  cuBLAS    CUBLAS_VERSION     CUDA_TOOLKIT_PATH
  cuDNN     CUDNN_VERSION      CUDNN_INSTALL_PATH
  NCCL      NCCL_VERSION       NCCL_INSTALL_PATH, NCCL_HDR_PATH
  TensorRT  TENSORRT_VERSION   TENSORRT_INSTALL_PATH

Versions environment variables can be of the form 'x' or 'x.y' to request a
specific version, empty or unspecified to accept any version.

The output of a found library is of the form:
tf_<library>_version: x.y.z
tf_<library>_header_dir: ...
tf_<library>_library_dir: ...
"""

import glob
import io
import os
import platform
import re
import subprocess
import sys

# pylint: disable=g-import-not-at-top
try:
    from shutil import which
except ImportError:
    from distutils.spawn import find_executable as which
# pylint: enable=g-import-not-at-top


class ConfigError(Exception):
    """Error type this script uses to report configuration-discovery failures."""


def _is_linux():
    return platform.system() == "Linux"


def _is_macos():
    return platform.system() == "Darwin"


def _matches_version(actual_version, required_version):
    """Checks whether some version meets the requirements.

      All elements of the required_version need to be present in the
      actual_version.

          required_version  actual_version  result
          -----------------------------------------
          1                 1.1             True
          1.2               1               False
          1.2               1.3             False
                            1               True

      Args:
        required_version: The version specified by the user.
        actual_version: The version detected from the CUDA installation.
      Returns: Whether the actual version matches the required one.
  """
    if actual_version is None:
        return False

    # Strip spaces from the versions.
    actual_version = actual_version.strip()
    required_version = required_version.strip()
    return actual_version.startswith(required_version)


def _at_least_version(actual_version, required_version):
    actual = [int(v) for v in actual_version.split(".")]
    required = [int(v) for v in required_version.split(".")]
    return actual >= required


def _get_header_version(path, name):
    """Returns preprocessor defines in C header file."""
    for line in io.open(path, "r", encoding="utf-8").readlines():
        match = re.match("\s*#\s*define %s\s+(\d+)" % name, line)
        if match:
            return match.group(1)
    return ""


def _cartesian_product(first, second):
    """Returns all path combinations of first and second."""
    return [os.path.join(f, s) for f in first for s in second]


def _get_ld_config_paths():
    """Collects the directories of all libraries listed by 'ldconfig -p'.

    Returns:
      A sorted list of unique directories, or an empty list on non-Linux
      systems.
    """
    if not _is_linux():
        return []

    ldconfig_path = which("ldconfig") or "/sbin/ldconfig"
    listing = subprocess.check_output([ldconfig_path, "-p"])

    arrow = re.compile(".* => (.*)")
    directories = set()
    for raw_line in listing.splitlines():
        try:
            text = raw_line.decode("ascii")
        except UnicodeDecodeError:
            # Lines that are not pure ASCII are ignored.
            continue
        found = arrow.match(text)
        if found:
            directories.add(os.path.dirname(found.group(1)))
    return sorted(directories)


def _get_default_cuda_paths(cuda_version):
    """Builds the default list of base paths (glob patterns) to search.

    Args:
      cuda_version: Requested CUDA version; may be empty, 'x' or 'x.y'.

    Returns:
      The versioned /usr/local/cuda-* pattern, a few fixed locations, and the
      directories reported by _get_ld_config_paths().
    """
    if not cuda_version:
        version_glob = "*"
    elif "." not in cuda_version:
        # Only a major version was given: accept any minor version.
        version_glob = cuda_version + ".*"
    else:
        version_glob = cuda_version

    fixed_locations = [
        "/usr/local/cuda-%s" % version_glob,
        "/usr/local/cuda",
        "/usr",
        "/usr/local/cudnn",
    ]
    return fixed_locations + _get_ld_config_paths()


def _header_paths():
    """Returns hard-coded set of relative paths to look for header files."""
    return [
        "",
        "include",
        "include/cuda",
        "include/*-linux-gnu",
        "extras/CUPTI/include",
        "include/cuda/CUPTI",
        "local/cuda/extras/CUPTI/include",
    ]


def _library_paths():
    """Returns hard-coded set of relative paths to look for library files."""
    return [
        "",
        "lib64",
        "lib",
        "lib/*-linux-gnu",
        "lib/x64",
        "extras/CUPTI/*",
        "local/cuda/lib64",
        "local/cuda/extras/CUPTI/lib64",
    ]


def _not_found_error(base_paths, relative_paths, filepattern):
    """Builds (does not raise) the ConfigError for an unsuccessful search.

    The message lists the relative sub-paths in the given order and the base
    paths in sorted order, one quoted entry per line.
    """
    entry = "\n        '%s'"
    base_listing = "".join(entry % path for path in sorted(base_paths))
    relative_listing = "".join(entry % path for path in relative_paths)
    message = "Could not find any %s in any subdirectory:%s\nof:%s\n" % (
        filepattern, relative_listing, base_listing)
    return ConfigError(message)


def _find_file(base_paths, relative_paths, filepattern):
    """Returns the first glob match for filepattern under the search paths.

    Every base path is combined with every relative path; the combinations
    are tried in order and the first match reported by glob wins.

    Raises:
      ConfigError: If no combination contains a matching file.
    """
    for directory in _cartesian_product(base_paths, relative_paths):
        hits = glob.glob(os.path.join(directory, filepattern))
        if hits:
            return hits[0]
    raise _not_found_error(base_paths, relative_paths, filepattern)


def _find_library(base_paths, library_name, required_version):
    """Returns first valid path to the requested library.

    Only the first dot-separated component of required_version takes part in
    the file name pattern: 'lib<name>.<major>*.dylib' on macOS and
    'lib<name>.so.<major>*' elsewhere.
    """
    major_component = required_version.split(".")[:1]
    if _is_macos():
        stem = ".".join(["lib" + library_name] + major_component)
        filepattern = "%s*.dylib" % stem
    else:
        stem = ".".join(["lib" + library_name, "so"] + major_component)
        filepattern = stem + "*"
    return _find_file(base_paths, _library_paths(), filepattern)


def _find_versioned_file(base_paths, relative_paths, filepatterns,
                         required_version, get_version):
    """Returns first valid path to a file that matches the requested version.

    Args:
      base_paths: Base directories to search.
      relative_paths: Sub-paths tried below every base directory.
      filepatterns: One glob pattern, or a list/tuple of them.
      required_version: Version requested by the user.
      get_version: Callable mapping a candidate path to its version.

    Returns:
      A (path, version) tuple for the first candidate whose version passes
      _matches_version.

    Raises:
      ConfigError: If no candidate matches.
    """
    if type(filepatterns) in (list, tuple):
        patterns = filepatterns
    else:
        patterns = [filepatterns]

    for directory in _cartesian_product(base_paths, relative_paths):
        for pattern in patterns:
            for candidate in glob.glob(os.path.join(directory, pattern)):
                candidate_version = get_version(candidate)
                if not _matches_version(candidate_version, required_version):
                    continue
                return candidate, candidate_version

    description = (", ".join(patterns) +
                   " matching version '%s'" % required_version)
    raise _not_found_error(base_paths, relative_paths, description)


def _find_header(base_paths, header_name, required_version, get_version):
    """Returns first valid path to a header that matches the requested version.

    This is _find_versioned_file restricted to the sub-paths from
    _header_paths(); the result is the same (path, version) tuple.
    """
    search_dirs = _header_paths()
    return _find_versioned_file(base_paths, search_dirs, header_name,
                                required_version, get_version)


def _find_cuda_config(base_paths, required_version):
    """Locates the CUDA toolkit pieces below base_paths.

    Looks for cuda.h, the cudart library, nvcc, the libdevice bitcode and the
    CUPTI header and library.

    Args:
      base_paths: Base directories to search.
      required_version: CUDA version requested by the user; may be empty.

    Returns:
      A dict with the detected CUDA version ('x.y'), the directories of the
      files found, and the toolkit path.

    Raises:
      ConfigError: If a file cannot be found, or if the toolkit paths derived
        from nvcc and from libdevice differ.
    """

    def get_header_version(path):
        # _get_header_version returns "" when the header does not define
        # CUDA_VERSION (e.g. an unrelated cuda.h). Reject such a header here
        # instead of letting int("") raise ValueError.
        raw_version = _get_header_version(path, "CUDA_VERSION")
        if not raw_version:
            return None
        version = int(raw_version)
        if not version:
            return None
        # Decoded as major = v // 1000 and minor = (v % 1000) // 10.
        return "%d.%d" % (version // 1000, version % 1000 // 10)

    cuda_header_path, header_version = _find_header(base_paths, "cuda.h",
                                                    required_version,
                                                    get_header_version)
    cuda_version = header_version  # x.y, see above.

    cuda_library_path = _find_library(base_paths, "cudart", cuda_version)

    def get_nvcc_version(path):
        # Raw string so the regex escapes are not treated as (invalid) Python
        # string escapes.
        pattern = r"Cuda compilation tools, release \d+\.\d+, V(\d+\.\d+\.\d+)"
        for line in subprocess.check_output([path, "--version"]).splitlines():
            match = re.match(pattern, line.decode("ascii"))
            if match:
                return match.group(1)
        return None

    nvcc_name = "nvcc"
    nvcc_path, nvcc_version = _find_versioned_file(base_paths, [
        "",
        "bin",
        "local/cuda/bin",
    ], nvcc_name, cuda_version, get_nvcc_version)

    nvvm_path = _find_file(base_paths, [
        "nvvm/libdevice",
        "share/cuda",
        "lib/nvidia-cuda-toolkit/libdevice",
        "local/cuda/nvvm/libdevice",
    ], "libdevice*.10.bc")

    cupti_header_path = _find_file(base_paths, _header_paths(), "cupti.h")
    cupti_library_path = _find_library(base_paths, "cupti", required_version)

    cuda_binary_dir = os.path.dirname(nvcc_path)
    nvvm_library_dir = os.path.dirname(nvvm_path)

    # XLA requires the toolkit path to find ptxas and libdevice.
    # TODO(csigg): pass in both directories instead.
    cuda_toolkit_paths = (
        os.path.normpath(os.path.join(cuda_binary_dir, "..")),
        os.path.normpath(os.path.join(nvvm_library_dir, "../..")),
    )
    if cuda_toolkit_paths[0] != cuda_toolkit_paths[1]:
        raise ConfigError("Inconsistent CUDA toolkit path: %s vs %s" %
                          cuda_toolkit_paths)

    return {
        "cuda_version": cuda_version,
        "cuda_include_dir": os.path.dirname(cuda_header_path),
        "cuda_library_dir": os.path.dirname(cuda_library_path),
        "cuda_binary_dir": cuda_binary_dir,
        "nvvm_library_dir": nvvm_library_dir,
        "cupti_include_dir": os.path.dirname(cupti_header_path),
        "cupti_library_dir": os.path.dirname(cupti_library_path),
        "cuda_toolkit_path": cuda_toolkit_paths[0],
    }


def _find_cublas_config(base_paths, required_version, cuda_version):
    """Locates the cuBLAS header and library.

    Args:
      base_paths: Base directories to search.
      required_version: cuBLAS version requested by the user; may be empty.
      cuda_version: Detected CUDA version ('x.y').

    Returns:
      A dict with 'cublas_version', 'cublas_include_dir' and
      'cublas_library_dir'.
    """
    header_name = "cublas_api.h"

    if not _at_least_version(cuda_version, "10.1"):
        # There is no version info available before CUDA 10.1, just find the
        # file and report the CUDA version as the header version.
        header_version = cuda_version
        header_path = _find_file(base_paths, _header_paths(), header_name)
        # The library lookup uses the user-requested version in this case.
        cublas_version = required_version
    else:
        macro_names = ("CUBLAS_VER_MAJOR", "CUBLAS_VER_MINOR",
                       "CUBLAS_VER_PATCH", "CUBLAS_VER_BUILD")

        def read_version(path):
            # Join the version macros that are present, skipping missing ones.
            parts = []
            for macro in macro_names:
                value = _get_header_version(path, macro)
                if value != "":
                    parts.append(value)
            return ".".join(parts)

        header_path, header_version = _find_header(base_paths, header_name,
                                                   required_version,
                                                   read_version)
        # cuBLAS uses the major version only.
        cublas_version = header_version.split(".")[0]

    library_path = _find_library(base_paths, "cublas", cublas_version)

    return {
        "cublas_version": header_version,
        "cublas_include_dir": os.path.dirname(header_path),
        "cublas_library_dir": os.path.dirname(library_path),
    }


def _find_cusolver_config(base_paths, required_version, cuda_version):
    """Locates the cuSOLVER header and library.

    Args:
      base_paths: Base directories to search.
      required_version: cuSOLVER version requested by the user; may be empty.
      cuda_version: Detected CUDA version ('x.y').

    Returns:
      A dict with 'cusolver_version', 'cusolver_include_dir' and
      'cusolver_library_dir'.
    """
    header_name = "cusolver_common.h"

    if not _at_least_version(cuda_version, "11.0"):
        # Below CUDA 11.0 the header is only located, not version-checked;
        # the CUDA version is reported as the header version.
        header_version = cuda_version
        header_path = _find_file(base_paths, _header_paths(), header_name)
        cusolver_version = required_version
    else:
        macro_names = ("CUSOLVER_VER_MAJOR", "CUSOLVER_VER_MINOR",
                       "CUSOLVER_VER_PATCH", "CUSOLVER_VER_BUILD")

        def read_version(path):
            # Join the version macros that are present, skipping missing ones.
            parts = []
            for macro in macro_names:
                value = _get_header_version(path, macro)
                if value != "":
                    parts.append(value)
            return ".".join(parts)

        header_path, header_version = _find_header(base_paths, header_name,
                                                   required_version,
                                                   read_version)
        # Only the major version is used for the library lookup.
        cusolver_version = header_version.split(".")[0]

    library_path = _find_library(base_paths, "cusolver", cusolver_version)

    return {
        "cusolver_version": header_version,
        "cusolver_include_dir": os.path.dirname(header_path),
        "cusolver_library_dir": os.path.dirname(library_path),
    }


def _find_curand_config(base_paths, required_version, cuda_version):
    """Locates the cuRAND header and library.

    Args:
      base_paths: Base directories to search.
      required_version: cuRAND version requested by the user; may be empty.
      cuda_version: Detected CUDA version ('x.y').

    Returns:
      A dict with 'curand_version', 'curand_include_dir' and
      'curand_library_dir'.
    """
    header_name = "curand.h"

    if not _at_least_version(cuda_version, "11.0"):
        # Below CUDA 11.0 the header is only located, not version-checked;
        # the CUDA version is reported as the header version.
        header_version = cuda_version
        header_path = _find_file(base_paths, _header_paths(), header_name)
        curand_version = required_version
    else:
        macro_names = ("CURAND_VER_MAJOR", "CURAND_VER_MINOR",
                       "CURAND_VER_PATCH", "CURAND_VER_BUILD")

        def read_version(path):
            # Join the version macros that are present, skipping missing ones.
            parts = []
            for macro in macro_names:
                value = _get_header_version(path, macro)
                if value != "":
                    parts.append(value)
            return ".".join(parts)

        header_path, header_version = _find_header(base_paths, header_name,
                                                   required_version,
                                                   read_version)
        # Only the major version is used for the library lookup.
        curand_version = header_version.split(".")[0]

    library_path = _find_library(base_paths, "curand", curand_version)

    return {
        "curand_version": header_version,
        "curand_include_dir": os.path.dirname(header_path),
        "curand_library_dir": os.path.dirname(library_path),
    }


def _find_cufft_config(base_paths, required_version, cuda_version):
    """Locates the cuFFT header and library.

    Args:
      base_paths: Base directories to search.
      required_version: cuFFT version requested by the user; may be empty.
      cuda_version: Detected CUDA version ('x.y').

    Returns:
      A dict with 'cufft_version', 'cufft_include_dir' and
      'cufft_library_dir'.
    """
    header_name = "cufft.h"

    if not _at_least_version(cuda_version, "11.0"):
        # Below CUDA 11.0 the header is only located, not version-checked;
        # the CUDA version is reported as the header version.
        header_version = cuda_version
        header_path = _find_file(base_paths, _header_paths(), header_name)
        cufft_version = required_version
    else:
        macro_names = ("CUFFT_VER_MAJOR", "CUFFT_VER_MINOR",
                       "CUFFT_VER_PATCH", "CUFFT_VER_BUILD")

        def read_version(path):
            # Join the version macros that are present, skipping missing ones.
            parts = []
            for macro in macro_names:
                value = _get_header_version(path, macro)
                if value != "":
                    parts.append(value)
            return ".".join(parts)

        header_path, header_version = _find_header(base_paths, header_name,
                                                   required_version,
                                                   read_version)
        # Only the major version is used for the library lookup.
        cufft_version = header_version.split(".")[0]

    library_path = _find_library(base_paths, "cufft", cufft_version)

    return {
        "cufft_version": header_version,
        "cufft_include_dir": os.path.dirname(header_path),
        "cufft_library_dir": os.path.dirname(library_path),
    }


def _find_cudnn_config(base_paths, required_version):
    """Locates the cuDNN header and library.

    The version is read from either cudnn.h or cudnn_version.h.

    Args:
      base_paths: Base directories to search.
      required_version: cuDNN version requested by the user; may be empty.

    Returns:
      A dict with 'cudnn_version' (major version only), 'cudnn_include_dir'
      and 'cudnn_library_dir'.
    """

    def read_version(path):
        major, minor, patch = (_get_header_version(path, macro)
                               for macro in ("CUDNN_MAJOR", "CUDNN_MINOR",
                                             "CUDNN_PATCHLEVEL"))
        # A header without CUDNN_MAJOR carries no usable version.
        if not major:
            return None
        return ".".join([major, minor, patch])

    header_names = ("cudnn.h", "cudnn_version.h")
    header_path, header_version = _find_header(base_paths, header_names,
                                               required_version, read_version)
    cudnn_version = header_version.split(".")[0]

    library_path = _find_library(base_paths, "cudnn", cudnn_version)

    return {
        "cudnn_version": cudnn_version,
        "cudnn_include_dir": os.path.dirname(header_path),
        "cudnn_library_dir": os.path.dirname(library_path),
    }


def _find_cusparse_config(base_paths, required_version, cuda_version):
    """Locates the cuSPARSE header and library.

    Args:
      base_paths: Base directories to search.
      required_version: cuSPARSE version requested by the user; may be empty.
      cuda_version: Detected CUDA version ('x.y').

    Returns:
      A dict with 'cusparse_version', 'cusparse_include_dir' and
      'cusparse_library_dir'.
    """
    header_name = "cusparse.h"

    if not _at_least_version(cuda_version, "11.0"):
        # Below CUDA 11.0 the header is only located, not version-checked;
        # the CUDA version is reported as the header version.
        header_version = cuda_version
        header_path = _find_file(base_paths, _header_paths(), header_name)
        cusparse_version = required_version
    else:
        macro_names = ("CUSPARSE_VER_MAJOR", "CUSPARSE_VER_MINOR",
                       "CUSPARSE_VER_PATCH", "CUSPARSE_VER_BUILD")

        def read_version(path):
            # Join the version macros that are present, skipping missing ones.
            parts = []
            for macro in macro_names:
                value = _get_header_version(path, macro)
                if value != "":
                    parts.append(value)
            return ".".join(parts)

        header_path, header_version = _find_header(base_paths, header_name,
                                                   required_version,
                                                   read_version)
        # Only the major version is used for the library lookup.
        cusparse_version = header_version.split(".")[0]

    library_path = _find_library(base_paths, "cusparse", cusparse_version)

    return {
        "cusparse_version": header_version,
        "cusparse_include_dir": os.path.dirname(header_path),
        "cusparse_library_dir": os.path.dirname(library_path),
    }


def _find_nccl_config(base_paths, required_version):
    """Locates the NCCL header and library.

    Args:
      base_paths: Base directories to search.
      required_version: NCCL version requested by the user; may be empty.

    Returns:
      A dict with 'nccl_version' (major version only), 'nccl_include_dir' and
      'nccl_library_dir'.
    """

    def read_version(path):
        # All three macros are joined, including any that are missing (those
        # contribute an empty component).
        parts = [
            _get_header_version(path, macro)
            for macro in ("NCCL_MAJOR", "NCCL_MINOR", "NCCL_PATCH")
        ]
        return ".".join(parts)

    header_path, header_version = _find_header(base_paths, "nccl.h",
                                               required_version, read_version)
    nccl_version = header_version.split(".")[0]

    library_path = _find_library(base_paths, "nccl", nccl_version)

    return {
        "nccl_version": nccl_version,
        "nccl_include_dir": os.path.dirname(header_path),
        "nccl_library_dir": os.path.dirname(library_path),
    }


def _find_tensorrt_config(base_paths, required_version):
    """Locates the TensorRT version header and the nvinfer library.

    Args:
      base_paths: Base directories to search.
      required_version: TensorRT version requested by the user; may be empty.

    Returns:
      A dict with 'tensorrt_version' (major version only),
      'tensorrt_include_dir' and 'tensorrt_library_dir'.
    """

    def read_version(path):
        # Materialize the values in a list because they are used multiple
        # times below.
        parts = [
            _get_header_version(path, macro)
            for macro in ("NV_TENSORRT_MAJOR", "NV_TENSORRT_MINOR",
                          "NV_TENSORRT_PATCH")
        ]
        if not all(parts):
            # A macro is missing: returning None makes _matches_version
            # reject this header.
            return None
        return ".".join(parts)

    header_path, header_version = _find_header(base_paths, "NvInferVersion.h",
                                               required_version, read_version)

    tensorrt_version = header_version.split(".")[0]
    library_path = _find_library(base_paths, "nvinfer", tensorrt_version)

    return {
        "tensorrt_version": tensorrt_version,
        "tensorrt_include_dir": os.path.dirname(header_path),
        "tensorrt_library_dir": os.path.dirname(library_path),
    }


def _list_from_env(env_name, default=[]):
    """Returns comma-separated list from environment variable."""
    if env_name in os.environ:
        return os.environ[env_name].split(",")
    return default


def _get_legacy_path(env_name, default=[]):
    """Returns a path specified by a legacy environment variable.

  CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to
  '/usr/lib/x86_64-linux-gnu' would previously find both library and header
  paths. Detect those and return '/usr', otherwise forward to _list_from_env().
  """
    if env_name in os.environ:
        match = re.match("^(/[^/ ]*)+/lib/\w+-linux-gnu/?$",
                         os.environ[env_name])
        if match:
            return [match.group(1)]
    return _list_from_env(env_name, default)


def _normalize_path(path):
    """Returns normalized path, with forward slashes on Windows."""
    return os.path.realpath(path)


def find_cuda_config():
    """Collects header/library locations for the libraries named in sys.argv.

    The command-line arguments (lower-cased) select which of "cuda", "cudnn",
    "nccl" and "tensorrt" are looked up. Search paths and required versions
    come from environment variables (CUDA_PATHS, CUDA_TOOLKIT_PATH,
    *_VERSION, *_INSTALL_PATH).

    Returns:
      A dict merging the results of the individual _find_*_config() helpers;
      every value whose key ends in "_dir" or "_path" is passed through
      _normalize_path().
    """
    requested = [arg.lower() for arg in sys.argv[1:]]
    cuda_version = os.environ.get("CUDA_VERSION", "")
    candidate_paths = _list_from_env("CUDA_PATHS",
                                     _get_default_cuda_paths(cuda_version))
    base_paths = [p for p in candidate_paths if os.path.exists(p)]

    config = {}
    if "cuda" in requested:
        cuda_paths = _list_from_env("CUDA_TOOLKIT_PATH", base_paths)
        config.update(_find_cuda_config(cuda_paths, cuda_version))

        # From here on use the version that was actually detected.
        cuda_version = config["cuda_version"]
        detected = tuple(int(part) for part in cuda_version.split("."))

        def paths_for(threshold):
            # Below `threshold` the library is searched in the toolkit paths,
            # otherwise in the general base paths.
            if detected < threshold:
                return cuda_paths
            return base_paths

        # Before CUDA 10.1, cuBLAS was in the same directory as the toolkit.
        config.update(
            _find_cublas_config(paths_for((10, 1)),
                                os.environ.get("CUBLAS_VERSION", ""),
                                cuda_version))
        config.update(
            _find_cusolver_config(paths_for((11, 0)),
                                  os.environ.get("CUSOLVER_VERSION", ""),
                                  cuda_version))
        config.update(
            _find_curand_config(paths_for((11, 0)),
                                os.environ.get("CURAND_VERSION", ""),
                                cuda_version))
        config.update(
            _find_cufft_config(paths_for((11, 0)),
                               os.environ.get("CUFFT_VERSION", ""),
                               cuda_version))
        config.update(
            _find_cusparse_config(paths_for((11, 0)),
                                  os.environ.get("CUSPARSE_VERSION", ""),
                                  cuda_version))

    if "cudnn" in requested:
        config.update(
            _find_cudnn_config(
                _get_legacy_path("CUDNN_INSTALL_PATH", base_paths),
                os.environ.get("CUDNN_VERSION", "")))

    if "nccl" in requested:
        config.update(
            _find_nccl_config(
                _get_legacy_path("NCCL_INSTALL_PATH", base_paths),
                os.environ.get("NCCL_VERSION", "")))

    if "tensorrt" in requested:
        config.update(
            _find_tensorrt_config(
                _get_legacy_path("TENSORRT_INSTALL_PATH", base_paths),
                os.environ.get("TENSORRT_VERSION", "")))

    # Only values are replaced, so iterating over a snapshot of the keys is
    # sufficient.
    for key in list(config):
        if key.endswith(("_dir", "_path")):
            config[key] = _normalize_path(config[key])

    return config


def main():
    """Prints the discovered configuration as `key: value` lines, sorted by key.

    If a ConfigError is raised, its message is written to stderr and the
    process exits with status 1.
    """
    try:
        config = find_cuda_config()
        for key in sorted(config):
            print("%s: %s" % (key, config[key]))
    except ConfigError as e:
        sys.stderr.write(str(e) + '\n')
        sys.exit(1)


if __name__ == "__main__":
    main()


================================================
FILE: build_deps/remote_config/BUILD
================================================


================================================
FILE: build_deps/remote_config/BUILD.tpl
================================================
# Each platform creates a constraint @<platform>//:platform_constraint that
# is listed in its constraint_values; rules that want to select a specific
# platform to run on can put @<platform>//:platform_constraint into their
# exec_compatible_with attribute.
# Toolchains can similarly be marked with target_compatible_with or
# exec_compatible_with to bind them to this platform.
# Constraint setting that owns the constraint value declared below.
constraint_setting(
    name = "platform_setting"
)

# Publicly visible constraint value; it is listed in the platform's
# constraint_values below.
constraint_value(
    name = "platform_constraint",
    constraint_setting = ":platform_setting",
    visibility = ["//visibility:public"],
)

# The %{cpu}, %{platform} and %{exec_properties} placeholders are filled in
# when this template is instantiated.
platform(
    name = "platform",
    visibility = ["//visibility:public"],
    constraint_values = [
        "@platforms//cpu:%{cpu}",
        "@platforms//os:%{platform}",
        ":platform_constraint",
    ],
    exec_properties = %{exec_properties},
)


================================================
FILE: build_deps/remote_config/common.bzl
================================================
"""Functions common across configure rules."""

BAZEL_SH = "BAZEL_SH"
PYTHON_BIN_PATH = "PYTHON_BIN_PATH"
PYTHON_LIB_PATH = "PYTHON_LIB_PATH"
PYTHON_CONFIG_REPO = "PYTHON_CONFIG_REPO"


def auto_config_fail(msg):
    """Aborts with `msg` prefixed by a red "Configuration Error:" label.

    Args:
      msg: description of the configuration problem
    """
    ansi_red = "\033[0;31m"
    ansi_reset = "\033[0m"
    text = "%sConfiguration Error:%s %s\n" % (ansi_red, ansi_reset, msg)
    fail(text)


def which(repository_ctx, program_name, allow_failure=False):
    """Looks up a program with the `which` command.

    Args:
      repository_ctx: the repository_ctx
      program_name: name of the program on the PATH
      allow_failure: forwarded to execute(); when False a failed lookup is
        treated as an error there

    Returns:
      The stdout of `which`, with backslashes doubled and trailing whitespace
      removed.
    """
    lookup = execute(
        repository_ctx,
        ["which", program_name],
        allow_failure=allow_failure,
    )
    path = lookup.stdout
    if path == None:
        return path
    return path.replace("\\", "\\\\").rstrip()


def get_python_bin(repository_ctx):
    """Gets the python bin path.

    The PYTHON_BIN_PATH host environment value wins when it is set and
    non-empty; otherwise "python3" and then "python" are looked up with
    which().

    Args:
      repository_ctx: the repository_ctx

    Returns:
      The python bin path.
    """
    configured = get_host_environ(repository_ctx, PYTHON_BIN_PATH)
    if configured:
        return configured

    # Prefer an explicit "python3"; some systems just call python3 "python".
    found = None
    for candidate in ("python3", "python"):
        found = which(repository_ctx, candidate, True)
        if found:
            return found

    auto_config_fail(
        "Cannot find python in PATH, please make sure " +
        "python is installed and add its directory in PATH, or --define " +
        "%s='/something/else'.\nPATH=%s" % (
            PYTHON_BIN_PATH,
            get_environ(repository_ctx, "PATH"),
        ))
    return found  # unreachable


def get_bash_bin(repository_ctx):
    """Gets the bash bin path.

    The BAZEL_SH host environment value is returned whenever it is set;
    otherwise "bash" is looked up with which().

    Args:
      repository_ctx: the repository_ctx

    Returns:
      The bash bin path.
    """
    configured = get_host_environ(repository_ctx, BAZEL_SH)
    if configured != None:
        return configured

    located = which(repository_ctx, "bash")
    if located != None:
        return located

    auto_config_fail(
        "Cannot find bash in PATH, please make sure " +
        "bash is installed and add its directory in PATH, or --define " +
        "%s='/path/to/bash'.\nPATH=%s" % (
            BAZEL_SH,
            get_environ(repository_ctx, "PATH"),
        ))
    return located


def read_dir(repository_ctx, src_dir):
    """Lists every regular file below a directory, sorted.

    Runs `find <src_dir> -follow -type f`, so subfolders are traversed and
    symlinks are followed. A failing `find` is tolerated.

    Args:
      repository_ctx: the repository_ctx
      src_dir: the directory to traverse

    Returns:
      A sorted list with one entry per line printed by `find`.
    """
    listing = execute(
        repository_ctx,
        ["find", src_dir, "-follow", "-type", "f"],
        allow_failure=True,
    ).stdout
    return sorted(listing.splitlines())


def get_environ(repository_ctx, name, default_value=None):
    """Reads an environment variable by echoing it from a bash subprocess.

    Args:
      repository_ctx: the repository_ctx
      name: the name of environment variable
      default_value: the value to return if the command prints nothing

    Returns:
      The stdout of `echo -n "$<name>"`, or 'default_value' when that output
      is empty (variable unset or set to the empty string).
    """
    shell_cmd = "echo -n \"$%s\"" % name
    value = execute(
        repository_ctx,
        [get_bash_bin(repository_ctx), "-c", shell_cmd],
        allow_failure=True,
    ).stdout
    if value:
        return value
    return default_value


def get_host_environ(repository_ctx, name, default_value=None):
    """Returns the value of an environment variable on the host platform.

    The host platform is the machine that Bazel runs on. The lookup first
    consults repository_ctx.os.environ and then, if the rule has one, its
    `environ` attribute.

    Args:
      repository_ctx: the repository_ctx
      name: the name of environment variable
      default_value: the value to return when neither source defines `name`

    Returns:
      The whitespace-stripped value of 'name', or 'default_value'.
    """
    host_env = repository_ctx.os.environ
    if name in host_env:
        return host_env.get(name).strip()

    if hasattr(repository_ctx.attr, "environ"):
        rule_env = repository_ctx.attr.environ
        if name in rule_env:
            return rule_env.get(name).strip()

    return default_value


def get_cpu_value(repository_ctx):
    """Returns the operating system name reported by `uname -s`.

    Args:
      repository_ctx: The repository context.
    Returns:
      The stripped stdout of `uname -s`.
    """
    return raw_exec(repository_ctx, ["uname", "-s"]).stdout.strip()


def execute(repository_ctx,
            cmdline,
            error_msg=None,
            error_details=None,
            allow_failure=False):
    """Runs a command and, unless told otherwise, fails on suspicious output.

    Args:
      repository_ctx: the repository_ctx object
      cmdline: list of strings, the command to execute
      error_msg: string, a summary of the error if the command fails
      error_details: string, details about the error or steps to fix it
      allow_failure: bool, if True, an empty stdout result or output to stderr
        is fine, otherwise either of these is an error
    Returns:
      The result of repository_ctx.execute(cmdline)
    """
    result = raw_exec(repository_ctx, cmdline)
    if allow_failure:
        return result

    # Anything on stderr, or nothing at all on stdout, counts as a failure.
    if result.stderr or not result.stdout:
        summary = error_msg.strip() if error_msg else "Repository command failed"
        details = error_details if error_details else ""
        fail("\n".join([summary, result.stderr.strip(), details]))
    return result


def raw_exec(repository_ctx, cmdline):
    """Thin wrapper around repository_ctx.execute().

    Having a single call site is useful for debugging, for example to print
    every executed command together with its return code.

    Args:
      repository_ctx: the repository_ctx
      cmdline: the list of args

    Returns:
      The 'exec_result' of repository_ctx.execute().
    """
    exec_result = repository_ctx.execute(cmdline)
    return exec_result


def files_exist(repository_ctx, paths, bash_bin=None):
    """Checks which files in paths exists.

    All paths are tested in one bash invocation that prints "True" or "False"
    per path, one line each.

    Args:
      repository_ctx: the repository_ctx
      paths: a list of paths
      bash_bin: path to the bash interpreter; looked up with get_bash_bin()
        when None

    Returns:
      Returns a list of Bool. True means that the path at the
      same position in the paths list exists. An empty `paths` list yields
      an empty result.
    """
    if not paths:
        # An empty command would print nothing, which execute() treats as a
        # failure; there is nothing to check anyway.
        return []

    if bash_bin == None:
        bash_bin = get_bash_bin(repository_ctx)

    cmd_tpl = "[ -e \"%s\" ] && echo True || echo False"
    cmds = [cmd_tpl % path for path in paths]
    cmd = " ; ".join(cmds)

    stdout = execute(repository_ctx, [bash_bin, "-c", cmd]).stdout.strip()
    return [val == "True" for val in stdout.splitlines()]


def realpath(repository_ctx, path, bash_bin=None):
    """Resolves `path` by running the `realpath` command through bash.

    Args:
      repository_ctx: the repository_ctx
      path: a path on the file system
      bash_bin: path to the bash interpreter; looked up with get_bash_bin()
        when None

    Returns:
      The stripped stdout of `realpath "<path>"`.
    """
    shell = bash_bin
    if shell == None:
        shell = get_bash_bin(repository_ctx)

    shell_cmd = "realpath \"%s\"" % path
    resolved = execute(repository_ctx, [shell, "-c", shell_cmd]).stdout
    return resolved.strip()


def err_out(result):
    """Returns stderr if set, else stdout.

    Workaround for a bug in RBE where stderr is returned as stdout: use
    err_out(result) wherever result.stderr would otherwise be read.

    Args:
      result: the exec_result.

    Returns:
      The stderr if it is non-empty, else stdout
    """
    if result.stderr:
        return result.stderr
    return result.stdout


def config_repo_label(config_repo, target):
    """Construct a label from config_repo and target.

    This function exists to ease the migration from preconfig to remote
    config. In preconfig the *_CONFIG_REPO environ variables are set to
    packages in the main repo while in remote config they will point to
    remote repositories.

    Args:
      config_repo: a remote repository or package.
      target: a target
    Returns:
      A label constructed from config_repo and target.
    """
    # A bare "@repo" (no "//" after the first character) means remote config.
    is_remote_repo = (config_repo.startswith("@") and
                      config_repo.find("//") <= 0)
    if is_remote_repo:
        return Label(config_repo + "//" + target)
    if target.startswith(":"):
        return Label(config_repo + target)
    return Label(config_repo + "/" + target)


================================================
FILE: build_deps/remote_config/remote_platform_configure.bzl
================================================
"""Repository rule to create a platform for a docker image to be used with RBE."""


def _remote_platform_configure_impl(repository_ctx):
    """Instantiates BUILD.tpl with the platform, cpu and exec properties.

    Args:
      repository_ctx: the repository_ctx of the rule being evaluated
    """
    platform = repository_ctx.attr.platform
    if platform == "local":
        os_name = repository_ctx.os.name.lower()
        platform = "osx" if os_name.startswith("mac os") else "linux"

    machine_type = repository_ctx.execute(["bash", "-c",
                                           "echo $MACHTYPE"]).stdout

    # Ordered (MACHTYPE prefix, cpu) pairs; the first matching prefix wins, so
    # "arm64" must be listed before the shorter "arm".
    cpu_by_prefix = [
        ("ppc", "ppc"),
        ("powerpc", "ppc"),
        ("s390x", "s390x"),
        ("aarch64", "aarch64"),
        ("arm64", "aarch64"),
        ("arm", "arm"),
        ("mips64", "mips64"),
        ("riscv64", "riscv64"),
    ]
    cpu = "x86_64"
    for prefix, cpu_name in cpu_by_prefix:
        if machine_type.startswith(prefix):
            cpu = cpu_name
            break

    # Render the exec properties as the text of a dict literal.
    entries = [
        "\"%s\" : \"%s\"," % (key, value)
        for key, value in repository_ctx.attr.platform_exec_properties.items()
    ]
    serialized_exec_properties = "{" + "".join(entries) + "}"

    repository_ctx.template(
        "BUILD",
        Label("//remote_config:BUILD.tpl"),
        {
            "%{platform}": platform,
            "%{exec_properties}": serialized_exec_properties,
            "%{cpu}": cpu,
        },
    )


# Repository rule backed by _remote_platform_configure_impl.
#   platform_exec_properties: required string dict.
#   platform: "linux" (default) or "local".
remote_platform_configure = repository_rule(
    implementation=_remote_platform_configure_impl,
    attrs={
        "platform_exec_properties": attr.string_dict(mandatory=True),
        "platform": attr.string(default="linux", values=["linux", "local"]),
    },
)


================================================
FILE: cmake/modules/ClangFormat.cmake
================================================
# Copyright Tomas Zeman 2018.
# Distributed under the Boost Software License, Version 1.0.
# (See accompanying file LICENSE_1_0.txt or copy at
# http://www.boost.org/LICENSE_1_0.txt)

# clangformat_setup(<sources>)
#
# Adds the target `${PROJECT_NAME}_clangformat`, which is part of `ALL` and
# runs clang-format in place (`-style=file -i`) on the given sources.
# CLANGFORMAT_EXECUTABLE may be preset; it defaults to `clang-format`.
function(clangformat_setup clangformat_srcs)

  # Fall back to the plain program name when nothing was configured.
  if(NOT CLANGFORMAT_EXECUTABLE)
    set(CLANGFORMAT_EXECUTABLE clang-format)
  endif()

  # If the value is not an existing file path, search for it as a program.
  if(NOT EXISTS ${CLANGFORMAT_EXECUTABLE})
    find_program(clangformat_executable_tmp ${CLANGFORMAT_EXECUTABLE})
    if(clangformat_executable_tmp)
      set(CLANGFORMAT_EXECUTABLE ${clangformat_executable_tmp})
      unset(clangformat_executable_tmp)
    else()
      message(FATAL_ERROR "ClangFormat: ${CLANGFORMAT_EXECUTABLE} not found! Aborting")
    endif()
  endif()

  # Convert every source entry to an absolute path.
  foreach(clangformat_src ${clangformat_srcs})
    get_filename_component(clangformat_src ${clangformat_src} ABSOLUTE)
    list(APPEND clangformat_srcs_tmp ${clangformat_src})
  endforeach()
  set(clangformat_srcs "${clangformat_srcs_tmp}")
  unset(clangformat_srcs_tmp)

  # The target is in ALL, so it runs on every default build.
  add_custom_target(${PROJECT_NAME}_clangformat ALL
                    COMMAND ${CLANGFORMAT_EXECUTABLE}
                            -style=file
                            -i
                            ${clangformat_srcs}
                    COMMENT "Formating with ${CLANGFORMAT_EXECUTABLE} ...")

endfunction()


================================================
FILE: docs/Makefile
================================================
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile clean

# Remove source/api and the README.md / CONTRIBUTING.md files inside the
# source directory, then let Sphinx clean the build directory.
# Like the other targets, extra options are passed through $(O) (letter O).
clean:
	rm -rf source/api source/README.md source/CONTRIBUTING.md
	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


================================================
FILE: docs/README.md
================================================
# Documentation

This folder contains the scripts necessary to build the documentation for HierarchicalKV.
You can view the generated [HierarchicalKV documentation](https://nvidia-merlin.github.io/HierarchicalKV/master/README.html).

## Contributing to Docs

Follow the instructions below to be able to build the docs.

1. Install required documentation tools and extensions:

```shell
sudo apt-get install doxygen
pip install -r docs/requirements-doc.txt
```

2. Build the documentation:

`make -C docs clean html`

The preceding command runs Sphinx in your shell and outputs to `docs/build/html/index.html`.

The build process for HierarchicalKV is unique among the Merlin projects because it
uses Doxygen, Breathe, and Exhale to create API documentation from the C++ source.

## Preview the changes

View the docs web page by opening the HTML in your browser.
Run the following command from the root of the repository:

```bash
python -m http.server 8000 --directory docs/build/html
```

Afterward, open a web browser and access `http://localhost:8000`.

Check that your edits are formatted correctly and read well.

## Decisions

### Rebuild the documentation on GitHub Pages

The `.github/workflows/docs-sched-rebuild.yaml` file rebuilds the documentation
for the `master` branch and the six most recent tags.  The job runs daily,
but you can trigger it manually by going to the following URL and clicking
the *Run workflow* button.

<https://github.com/NVIDIA-Merlin/HierarchicalKV/actions/workflows/docs-sched-rebuild.yaml>

### Source management: README and index files

* To preserve Sphinx's expectation that all source files are child files and directories
  of the `docs/source` directory, other content, such as the `README.md` file is
  copied to the source directory. You can determine which directories and files are copied by
  viewing `docs/source/conf.py` and looking for the `copydirs_additional_dirs` list.
  Directories are specified relative to the Sphinx source directory, `docs/source`.

* One consequence of the preceding bullet is that any change to the original files,
  such as adding or removing a topic, requires a similar change to the `docs/source/toc.yaml`
  file.  Updating the `docs/source/toc.yaml` file is not automatic.

* Because the GitHub browsing expectation is that a `README.md` file is rendered when you
  browse a directory, when a directory is copied, the `README.md` file is renamed to
  `index.md` to meet the HTML web server expectation of locating an `index.html` file
  in a directory.

### Adding links

TIP: When adding a link to a method or any heading that has underscores in it, repeat
the underscores in the link even though they are converted to hyphens in the HTML.

Refer to the following examples:

* `../somefile.md#2heading-with-spaces-and_underscore_separated_words-too`
* `./otherfile.md#save_params_to_files-method`

#### Docs-to-docs links

There is no concern for the GitHub browsing experience for files in the `docs/source/` directory.
You can use a relative path for the link.  For example, both the `README.md` file and the
`CONTRIBUTING.md` file are copied to `docs/source`. Because they are both in the same
directory, you could add a link to a heading in the `README.md` file like this:

```markdown
To build HierarchicalKV from scratch, refer to
[How to Build](./README.md#how-to-build) in the `README` file.
```

When Sphinx renders the link, the `.md` file suffix is replaced with `.html`.

#### Docs-to-repository links

Some files that we publish as docs, such as the `CONTRIBUTING.md` file, refer readers to files
that are not published as docs. For example, we currently do not publish the `STYLE_GUIDE.md`
file.

To refer a reader to the `STYLE_GUIDE.md`, a README, or program, state that the link is to
the repository:

```markdown
## Coding Style
Refer to the [Style Guide](https://github.com/NVIDIA-Merlin/HierarchicalKV/blob/master/STYLE_GUIDE.md)
in the GitHub repository for more details.
```

The idea is to let a reader know that following the link&mdash;whether from an HTML docs page or
from browsing GitHub&mdash;results in viewing our repository on GitHub.


================================================
FILE: docs/make.bat
================================================
@ECHO OFF

REM Work from the directory that contains this script.
pushd %~dp0

REM Command file for Sphinx documentation

REM Default to "sphinx-build" unless SPHINXBUILD is already set.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

REM Without a target argument, show the Sphinx help.
if "%1" == "" goto help

REM Probe the command; errorlevel 9009 or above means it could not be run.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Forward the requested target to Sphinx "make mode".
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
REM Restore the caller's working directory.
popd


================================================
FILE: docs/requirements-doc.txt
================================================
# packages necessary to run tests and push PRs
# assumes requirements for nvtabular logic are already installed

wheel

# docs
Sphinx<3.6
jinja2<3.1
markupsafe==2.0.1
sphinx-multiversion@git+https://github.com/mikemckiernan/sphinx-multiversion.git
sphinxcontrib-copydirs@git+https://github.com/mikemckiernan/sphinxcontrib-copydirs.git
sphinx-external-toc<0.4
sphinx_rtd_theme
natsort<8.2
myst-nb
markdown-it-py
linkify-it-py

# C++
exhale<0.4


================================================
FILE: docs/source/_static/.gitkeep
================================================


================================================
FILE: docs/source/_static/css/banner.css
================================================
.wy-nav-content {
    margin: 0;
    background: #fcfcfc;
    padding-top: 40px;
}

/* Sidebar search box. */
.wy-side-nav-search {
    display: block;
    width: 300px;
    padding: .809em;
    margin-bottom: .809em;
    z-index: 200;
    background-color: #2980b9;
    text-align: center;
    color: #fcfcfc;
    /* Overrides the top value of the `padding` shorthand above. */
    padding-top: 40px;
}

div.banner {
    position: fixed;
    top: 10px;
    left: 20px;
    margin: 0;
    z-index: 1000;
    width: 1050px;
    text-align: center;
}

p.banner {
  border-radius: 4px;
  color: #004831;
  background: #76b900;
}

================================================
FILE: docs/source/_static/css/custom.css
================================================
dl.cpp > dt > span.pre { padding-right: 2px; }

/* dl.cpp > dt > a > span.pre { padding-right: 2px; } */

dl > dt > em > span.pre { padding-right: 0px; padding-left: 2px; }

dl > dt > code.sig-name > span.pre { padding-left: 2px; }

footer div p {
  font-size: 80%;
}

footer div p a {
  color: var(--small-font-color);
}

footer div p a:hover {
  color: var(--small-font-color);
}


================================================
FILE: docs/source/_templates/footer.html
================================================
{% extends '!footer.html' %}
{% block contentinfo %}
{{ super() }}
<p>
<a href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank">Privacy Policy</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank">Manage My Privacy</a> |
<a href="https://www.nvidia.com/en-us/preferences/start/" target="_blank">Do Not Sell or Share My Data</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank">Terms of Service</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank">Accessibility</a> |
<a href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank">Corporate Policies</a> |
<a href="https://www.nvidia.com/en-us/product-security/" target="_blank">Product Security</a> |
<a href="https://www.nvidia.com/en-us/contact/" target="_blank">Contact</a>
</p>
{% endblock %}

================================================
FILE: docs/source/_templates/versions.html
================================================
{%- if current_version %}
<div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
  <span class="rst-current-version" data-toggle="rst-current-version">
    <span class="fa fa-book"> Other Versions</span>
    v: {{ current_version.name }}
    <span class="fa fa-caret-down"></span>
  </span>
  <div class="rst-other-versions">
    {%- if versions.tags %}
    <dl>
      <dt>Tags</dt>
      {%- for item in versions.tags %}
      <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
      {%- endfor %}
    </dl>
    {%- endif %}
    {%- if versions.branches %}
    <dl>
      <dt>Branches</dt>
      {%- for item in versions.branches %}
      <dd><a href="{{ item.url }}">{{ item.name }}</a></dd>
      {%- endfor %}
    </dl>
    {%- endif %}
  </div>
</div>
{%- endif %}


================================================
FILE: docs/source/conf.py
================================================
"""
 Copyright (c) 2021, NVIDIA CORPORATION.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
"""

# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import subprocess
import sys

from datetime import datetime
from natsort import natsorted

sys.path.insert(0, os.path.abspath("../.."))

repodir = os.path.abspath(os.path.join(__file__, r"../../.."))
gitdir = os.path.join(repodir, r".git")

# -- Project information -----------------------------------------------------

year_range = "2022"
year_now = str(datetime.now().year)
if year_range != year_now:
    year_range = year_range + chr(8211) + year_now

project = 'Merlin Key-Value Storage'
copyright = year_range + ", NVIDIA"
author = 'NVIDIA'

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "myst_nb",
    "sphinx_external_toc",
    "sphinx_rtd_theme",
    "sphinx.ext.autodoc",
    "sphinx.ext.autosummary",
    "sphinx.ext.coverage",
    "sphinx.ext.githubpages",
    "sphinx.ext.napoleon",
    "sphinx.ext.viewcode",
    "sphinx.ext.intersphinx",
    "sphinx_multiversion",
    "sphinxcontrib.copydirs",
    "breathe",
    "exhale",
]

# MyST configuration settings
external_toc_path = "toc.yaml"
myst_enable_extensions = [
    "deflist",
    "html_image",
    "linkify",
    "replacements",
    "tasklist",
    "dollarmath",
]
myst_linkify_fuzzy_links = False
myst_heading_anchors = 4
nb_execution_mode = "off"

# Some documents are RST and include `.. toctree::` directives.
suppress_warnings = ["etoc.toctree", "myst.header", "misc.highlighting_failure"]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
html_theme_options = {
    "navigation_depth": 2,
    "analytics_id": "G-NVJ1Y1YJHK",
}
html_show_sourcelink = False
html_show_sphinx = False

# Whitelist pattern for tags (set to None to ignore all tags)
# Determine if Sphinx is reading conf.py from the checked out
# repo (a Git repo) vs SMV reading conf.py from an archive of the repo
# at a commit (not a Git repo).
# (SMV is the sphinx_multiversion extension enabled in `extensions` above.)
if os.path.exists(gitdir):
    # List every tag whose name starts with "v". The command runs in the
    # current working directory of the Sphinx process.
    tag_refs = (
        subprocess.check_output(["git", "tag", "-l", "v*"]).decode("utf-8").split()
    )
    # Natural sort so that e.g. v0.10.0 orders after v0.9.0, then keep only
    # the last six entries.
    tag_refs = natsorted(tag_refs)[-6:]
    # Anchored alternation of the selected tag names.
    # NOTE(review): names are inserted unescaped, so "." in a tag matches any
    # character; with no tags at all the pattern becomes "^()$".
    smv_tag_whitelist = r"^(" + r"|".join(tag_refs) + r")$"
else:
    # SMV is reading conf.py from a Git archive of the repo at a specific commit.
    smv_tag_whitelist = r"^v.*$"

# Only include main branch for now
smv_branch_whitelist = "^master$"

smv_refs_override_suffix = "-docs"

html_sidebars = {"**": ["versions.html"]}
html_baseurl = "https://nvidia-merlin.github.io/HierarchicalKV/master"

html_static_path = [ '_static' ]
html_css_files = [ "css/custom.css", "css/banner.css" ]

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".

source_suffix = [".rst", ".md"]

breathe_projects = {
    "HierarchicalKV": "/tmp/doxygen/xml"
}
breathe_default_project = "HierarchicalKV"

exhale_args = {
    "containmentFolder": "./api",
    "rootFileName": "index.rst",
    "doxygenStripFromPath": "../../include",
    "rootFileTitle": "HierarchicalKV C++ API Documentation",
    "fullApiSubSectionTitle": "Complete HierarchicalKV API",
    "createTreeView": False,
    "exhaleExecutesDoxygen": True,
    "exhaleDoxygenStdin": """
      FILE_PATTERNS = *.h *.cuh
      RECURSIVE = NO
      EXTENSION_MAPPING = cuh=C++
      HIDE_UNDOC_CLASSES = YES
      HIDE_FRIEND_COMPOUNDS = YES
      SORT_MEMBERS_CTORS_1ST = YES
      SHOW_USED_FILES = NO
      SHOW_FILES = NO
      SHOW_NAMESPACES = NO
      INPUT = ../../include
      INPUT_ENCODING = UTF-8
      """,
}

copydirs_additional_dirs = [
    "../../CONTRIBUTING.md",
    "../../README.md",
]
copydirs_file_rename = {
    "README.md": "index.md",
}


================================================
FILE: docs/source/index.rst
================================================
Merlin Key-Value Storage
========================

Merlin Key-Value Storage is an open source library that provides hierarchical key-value storage using on-GPU high-bandwidth memory (HBM) and host RAM.

For more information, see the `Introduction <README.html>`_.

Related Resources
-----------------

Merlin Key-Value Storage GitHub Repository
  `<https://github.com/NVIDIA-Merlin/HierarchicalKV>`_

About Merlin
  Merlin is the overarching project that brings together the Merlin projects.
  See the `documentation <https://nvidia-merlin.github.io/Merlin/main/README.html>`_
  or the `repository <https://github.com/NVIDIA-Merlin/Merlin>`_ on GitHub.

Developer website for Merlin
  More information about Merlin is available at our developer website:
  `<https://developer.nvidia.com/nvidia-merlin>`_.


================================================
FILE: docs/source/toc.yaml
================================================
root: index
subtrees:
  - caption: Contents
    entries:
      - file: README.md
        title: Introduction
      - file: api/index.rst
        title: API Documentation
      - file: CONTRIBUTING.md
        title: Contributing to HierarchicalKV


# The multi-modal data example uses several notebooks to demonstrate how to use multi-modal data (text and images)
# to provide movie recommendations based on the MovieLens 25M dataset.

# .. toctree::
#    :maxdepth: 1


================================================
FILE: include/BUILD
================================================
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_cc_library")

# Header-only target exposing merlin_localfile.hpp. Publicly visible; pulls in
# the //include/merlin headers and the local CUDA toolchain configuration.
cuda_cc_library(
    name = "merlin_localfile",
    hdrs = [
        "merlin_localfile.hpp",
    ],
    visibility = [
        "//visibility:public",
    ],
    deps = [
        "//include/merlin",
        "@local_config_cuda//cuda",
    ],
)

# Header-only target exposing merlin_hashtable.cuh. Publicly visible; pulls in
# the //include/merlin headers and the local CUDA toolchain configuration.
cuda_cc_library(
    name = "merlin_hashtable",
    hdrs = [
        "merlin_hashtable.cuh",
    ],
    visibility = [
        "//visibility:public",
    ],
    deps = [
        "//include/merlin",
        "@local_config_cuda//cuda",
    ],
)


================================================
FILE: include/merlin/BUILD
================================================
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_cc_library")

# types.cuh and utils.cuh packaged as their own target so that other targets
# (the "merlin" target below and //include/merlin/core_kernels) can depend on
# them without depending on the rest of this package.
cuda_cc_library(
    name = "types_and_utils",
    srcs = [
    ],
    hdrs = [
        "types.cuh",
        "utils.cuh",
    ],
    visibility = [
        "//visibility:public",
    ],
    deps = [
        "@local_config_cuda//cuda",
    ],
)

# Main header-only target of this package: the remaining headers listed in
# `hdrs`, layered on top of :types_and_utils and the core_kernels sub-package.
cuda_cc_library(
    name = "merlin",
    srcs = [
    ],
    hdrs = [
        "allocator.cuh",
        "array_kernels.cuh",
        "core_kernels.cuh",
        "debug.hpp",
        "flexible_buffer.cuh",
        "group_lock.cuh",
        "memory_pool.cuh",
        "optimizers.cuh",
    ],
    visibility = [
        "//visibility:public",
    ],
    deps = [
        "//include/merlin:types_and_utils",
        "//include/merlin/core_kernels",
        "@local_config_cuda//cuda",
    ],
)


================================================
FILE: include/merlin/allocator.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <stdlib.h>
#include <thrust/device_malloc_allocator.h>
#include "debug.hpp"
#include "utils.cuh"

namespace nv {
namespace merlin {

// Kinds of memory an allocator can be asked to allocate or free.
enum MemoryType {
  Device,   // HBM
  Pinned,   // Pinned Host Memory
  Host,     // Host Memory
  Managed,  // Pageable Host Memory(Not required)
            // Not supported by DefaultAllocator in this file.
};

/* This abstract class defines the allocator APIs required by HKV.
   Any of the customized allocators should inherit from it.

   Instances are neither copyable nor movable. All four operations are pure
   virtual; DefaultAllocator below is the reference implementation.
 */
class BaseAllocator {
 public:
  BaseAllocator(const BaseAllocator&) = delete;
  BaseAllocator(BaseAllocator&&) = delete;

  BaseAllocator& operator=(const BaseAllocator&) = delete;
  BaseAllocator& operator=(BaseAllocator&&) = delete;

  BaseAllocator() = default;
  virtual ~BaseAllocator() = default;

  // Allocate `size` bytes of memory of kind `type` and store the address in
  // `*ptr`. `pinned_flags` is only meaningful for MemoryType::Pinned (it is
  // what DefaultAllocator forwards to cudaMallocHost).
  virtual void alloc(const MemoryType type, void** ptr, size_t size,
                     unsigned int pinned_flags = cudaHostAllocDefault) = 0;

  // Stream-ordered variant of alloc(): the allocation is enqueued on `stream`.
  virtual void alloc_async(const MemoryType type, void** ptr, size_t size,
                           cudaStream_t stream) = 0;

  // Release memory of kind `type` previously obtained from this allocator.
  virtual void free(const MemoryType type, void* ptr) = 0;

  // Stream-ordered variant of free(): the release is enqueued on `stream`.
  virtual void free_async(const MemoryType type, void* ptr,
                          cudaStream_t stream) = 0;
};

/* Default BaseAllocator implementation backed by the CUDA runtime
   (cudaMalloc / cudaMallocHost / cudaMallocAsync and their free counterparts)
   and by the C heap for MemoryType::Host.

   Supported memory types: Device, Pinned and Host for the synchronous calls;
   Device only for the stream-ordered calls. Any other request is reported
   through MERLIN_CHECK instead of being silently ignored.
 */
class DefaultAllocator : public virtual BaseAllocator {
 public:
  DefaultAllocator() {}
  ~DefaultAllocator() override {}

  /* Allocates `size` bytes of memory of kind `type` and stores the address in
     `*ptr`.

     `pinned_flags` is forwarded to cudaMallocHost and is only used for
     MemoryType::Pinned. CUDA failures are reported through CUDA_CHECK; a
     failed host malloc or an unsupported memory type through MERLIN_CHECK.
   */
  void alloc(const MemoryType type, void** ptr, size_t size,
             unsigned int pinned_flags = cudaHostAllocDefault) override {
    switch (type) {
      case MemoryType::Device:
        CUDA_CHECK(cudaMalloc(ptr, size));
        break;
      case MemoryType::Pinned:
        CUDA_CHECK(cudaMallocHost(ptr, size, pinned_flags));
        break;
      case MemoryType::Host:
        *ptr = std::malloc(size);
        // malloc(0) may legally return nullptr, so only a non-empty request
        // that comes back null is treated as a failure.
        MERLIN_CHECK(*ptr != nullptr || size == 0,
                     "[DefaultAllocator] alloc failed for MemoryType::Host!");
        break;
      default:
        // e.g. MemoryType::Managed: previously fell through and left *ptr
        // untouched, handing the caller an indeterminate pointer.
        MERLIN_CHECK(false,
                     "[DefaultAllocator] alloc does not support this "
                     "MemoryType!");
        break;
    }
    return;
  }

  /* Stream-ordered allocation on `stream`; only MemoryType::Device is
     supported.
   */
  void alloc_async(const MemoryType type, void** ptr, size_t size,
                   cudaStream_t stream) override {
    if (type == MemoryType::Device) {
      CUDA_CHECK(cudaMallocAsync(ptr, size, stream));
    } else {
      MERLIN_CHECK(false,
                   "[DefaultAllocator] alloc_async is only support for "
                   "MemoryType::Device!");
    }
    return;
  }

  /* Releases memory of kind `type` obtained from alloc(). A null `ptr` is a
     no-op. An unsupported memory type is reported through MERLIN_CHECK.
   */
  void free(const MemoryType type, void* ptr) override {
    if (ptr == nullptr) {
      return;
    }
    switch (type) {
      case MemoryType::Pinned:
        CUDA_CHECK(cudaFreeHost(ptr));
        break;
      case MemoryType::Device:
        CUDA_CHECK(cudaFree(ptr));
        break;
      case MemoryType::Host:
        std::free(ptr);
        break;
      default:
        // Previously a silent no-op, i.e. the memory was leaked.
        MERLIN_CHECK(false,
                     "[DefaultAllocator] free does not support this "
                     "MemoryType!");
        break;
    }
    return;
  }

  /* Stream-ordered release on `stream`; only MemoryType::Device is supported.
     A null `ptr` is a no-op.
   */
  void free_async(const MemoryType type, void* ptr,
                  cudaStream_t stream) override {
    if (ptr == nullptr) {
      return;
    }

    if (type == MemoryType::Device) {
      CUDA_CHECK(cudaFreeAsync(ptr, stream));
    } else {
      MERLIN_CHECK(false,
                   "[DefaultAllocator] free_async is only support for "
                   "MemoryType::Device!");
    }
  }
};

/* Thrust-compatible device allocator that forwards every request to a
   user-supplied BaseAllocator. set_allocator() must be called before the
   first allocate()/deallocate(); otherwise MERLIN_CHECK reports an error.
 */
template <typename T>
struct ThrustAllocator : thrust::device_malloc_allocator<T> {
 public:
  using super_t = thrust::device_malloc_allocator<T>;
  using pointer = typename super_t::pointer;
  using size_type = typename super_t::size_type;

  // Obtains device storage for `n` objects of type T from the backing
  // allocator.
  pointer allocate(size_type n) {
    MERLIN_CHECK(
        allocator_ != nullptr,
        "[ThrustAllocator] set_allocator should be called in advance!");
    void* raw = nullptr;
    allocator_->alloc(MemoryType::Device, &raw, sizeof(T) * n);
    return pointer(static_cast<T*>(raw));
  }

  // Returns storage to the backing allocator. `n` is not used.
  void deallocate(pointer p, size_type n) {
    MERLIN_CHECK(
        allocator_ != nullptr,
        "[ThrustAllocator] set_allocator should be called in advance!");
    T* raw = p.get();
    allocator_->free(MemoryType::Device, raw);
  }

  // Selects the allocator used by subsequent allocate()/deallocate() calls.
  // The pointer is stored as-is; this struct never deletes it.
  void set_allocator(BaseAllocator* allocator) { allocator_ = allocator; }

  BaseAllocator* allocator_ = nullptr;
};

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/array_kernels.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cooperative_groups.h>
#include "cuda_runtime.h"
#include "thrust/device_vector.h"
#include "thrust/execution_policy.h"
#include "thrust/scan.h"
#include "types.cuh"
#include "utils.cuh"

namespace nv {
namespace merlin {

/* Writes masks[i] = (keys[i] != EMPTY_KEY) for every i in [0, n).

   One thread handles one element, so the launch must provide at least `n`
   threads in a 1-D grid.
 */
template <typename K>
__global__ void keys_not_empty(const K* keys, bool* masks, size_t n) {
  // Compute the global index in 64 bits: `n` is a size_t, and the 32-bit
  // product blockIdx.x * blockDim.x wraps for very large launches.
  const size_t tid =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (tid < n) {
    masks[tid] = keys[tid] != EMPTY_KEY;
  }
}

/* Counts the set entries of `masks` per tile of TILE_SIZE consecutive
   elements.

   `offsets[tile]` receives the number of true masks in that tile, and the
   same count is atomically added to `*n_existed`.
   NOTE(review): `*n_existed` is only accumulated into here; presumably the
   caller zeroes it before the launch - confirm.

   One thread handles one element (1-D launch with at least `n` threads).
   Threads past `n` still take part in the ballot with a false vote.
 */
template <typename Tidx, int TILE_SIZE = 8>
__global__ void gpu_cell_count(const bool* masks, Tidx* offsets, size_t n,
                               size_t* n_existed) {
  // 64-bit global index; the 32-bit product blockIdx.x * blockDim.x would
  // wrap for very large `n`.
  const size_t tid =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  const int rank = g.thread_rank();

  const bool is_existed = (tid < n) && masks[tid];

  // One bit per lane of the tile; the population count is the tile's total.
  const unsigned int vote = g.ballot(is_existed);
  const int g_ones = __popc(vote);

  // Lane 0 has the smallest index of its tile, so it is in range whenever
  // any lane of the tile is.
  if (rank == 0 && tid < n) {
    offsets[tid / TILE_SIZE] = static_cast<Tidx>(g_ones);
    atomicAdd(static_cast<uint64_t*>(n_existed), static_cast<uint64_t>(g_ones));
  }
}

/* Compacts the entries selected by `masks` towards the front of `keys`,
   `values` and `scores`, in place.

   `offsets[tile]` must hold the number of selected entries in all preceding
   tiles (gpu_boolean_mask produces it with an exclusive scan). The
   destination of a selected element is that offset plus the number of
   selected lanes before it inside its own tile.

   An element already at its destination is left alone. Otherwise the thread
   spins until the destination key slot holds EMPTY_KEY, claims it with a
   compare-exchange, copies the score (when `scores` is non-null) and the
   `dim` values, and finally marks its source key slot EMPTY_KEY.
   NOTE(review): progress of the spin relies on the destination slot
   eventually becoming empty; presumably every unselected slot already holds
   EMPTY_KEY - confirm against callers.

   One thread handles one element (1-D launch with at least `n` threads).
 */
template <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>
__global__ void gpu_select_kvm_kernel(const bool* masks, size_t n,
                                      const Tidx* offsets, K* __restrict keys,
                                      V* __restrict values,
                                      S* __restrict scores, const size_t dim) {
  // 64-bit global index; the 32-bit product blockIdx.x * blockDim.x would
  // wrap for very large `n`.
  const size_t tid =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int rank = g.thread_rank();

  bool is_existed = false;
  if (tid < n) {
    if (masks[tid]) {
      is_existed = true;
    }
  }
  unsigned int vote = g.ballot(is_existed);
  // Bit-reverse so that lane i sits at bit (TILE_SIZE - 1 - i).
  unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE);
  K empty_key = (K)EMPTY_KEY;

  if (tid < n) {
    // Drop the bits of all higher lanes: what remains are the votes of lanes
    // 0..rank, with this lane's own vote at bit 0.
    r_vote = r_vote >> (TILE_SIZE - rank - 1);
    if (masks[tid]) {
      // Selected lanes strictly before this one inside the tile.
      int prefix_n = __popc(r_vote) - 1;
      Tidx bias = offsets[tid / TILE_SIZE] + static_cast<Tidx>(prefix_n);

      // Already at its compacted position: nothing to move.
      if (static_cast<size_t>(bias) == tid) return;

      K target_key = 0;
      AtomicKey<K>* atomic_key = reinterpret_cast<AtomicKey<K>*>(keys) + bias;
      // Spin until the destination slot is observed empty and claimed. On
      // failure compare_exchange_weak overwrites target_key with the value
      // it saw, which keeps the loop going.
      while (target_key != empty_key) {
        target_key = empty_key;
        atomic_key->compare_exchange_weak(target_key, keys[tid],
                                          cuda::std::memory_order_relaxed,
                                          cuda::std::memory_order_relaxed);
      }
      if (scores) scores[bias] = scores[tid];
      for (size_t j = 0; j < dim; j++) {
        values[dim * bias + j] = values[dim * tid + j];
      }
      // Release the source slot only after its payload has been copied.
      atomic_key = reinterpret_cast<AtomicKey<K>*>(keys) + tid;
      atomic_key->store(empty_key, cuda::std::memory_order_relaxed);
    }
  }
}

/* Host-side driver that compacts the entries selected by `masks` to the
   front of `keys` / `values` / `scores`, entirely on `stream`:

     1. gpu_cell_count writes the per-tile selected counts into `offsets` and
        accumulates the total into `*n_evicted`;
     2. an in-place exclusive scan turns those counts into per-tile start
        positions;
     3. gpu_select_kvm_kernel moves every selected entry to its position.

   `offsets` must have room for ceil(n / TILE_SIZE) elements. All work is
   only enqueued on `stream`; this function does not synchronize explicitly.
 */
template <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>
void gpu_boolean_mask(size_t grid_size, size_t block_size, const bool* masks,
                      size_t n, size_t* n_evicted, Tidx* offsets,
                      K* __restrict keys, V* __restrict values,
                      S* __restrict scores, size_t dim, cudaStream_t stream) {
  // Step 1: per-tile counts.
  gpu_cell_count<Tidx, TILE_SIZE>
      <<<grid_size, block_size, 0, stream>>>(masks, offsets, n, n_evicted);

  // Step 2: counts -> start offsets. par_nosync is used when the Thrust
  // version provides it.
#if THRUST_VERSION >= 101600
  auto exec_policy = thrust::cuda::par_nosync.on(stream);
#else
  auto exec_policy = thrust::cuda::par.on(stream);
#endif
  const size_t tile_count = (n + TILE_SIZE - 1) / TILE_SIZE;
  thrust::device_ptr<Tidx> tile_offsets(offsets);
  thrust::exclusive_scan(exec_policy, tile_offsets, tile_offsets + tile_count,
                         tile_offsets);

  // Step 3: move the selected entries.
  gpu_select_kvm_kernel<K, V, S, Tidx, TILE_SIZE>
      <<<grid_size, block_size, 0, stream>>>(masks, n, offsets, keys, values,
                                             scores, dim);
}

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/core_kernels/BUILD
================================================
load("@local_config_cuda//cuda:build_defs.bzl", "cuda_cc_library")

# Header-only target containing every .cuh file under this directory
# (recursive glob). Publicly visible; depends on the shared types/utils
# headers and the local CUDA toolchain configuration.
cuda_cc_library(
    name = "core_kernels",
    srcs = [],
    hdrs = glob([
        "**/*.cuh",
    ]),
    visibility = [
        "//visibility:public",
    ],
    deps = [
        "//include/merlin:types_and_utils",
        "@local_config_cuda//cuda",
    ],
)


================================================
FILE: include/merlin/core_kernels/accum_or_assign.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

/* Cooperatively applies one vector of `dim` elements to `dst`.

   The lanes of tile `g` stride over the elements: with `is_accum` set each
   element of `delta_or_val` is added to `dst`, otherwise it overwrites it.
 */
template <class V, uint32_t TILE_SIZE = 4>
__device__ __forceinline__ void accum_or_assign_vector(
    cg::thread_block_tile<TILE_SIZE> const& g, const V* delta_or_val, V* dst,
    const bool is_accum, const size_t dim) {
  const auto first = g.thread_rank();
  const auto step = g.size();

  // The mode is the same for the whole vector, so branch once.
  if (is_accum) {
    for (auto i = first; i < dim; i += step) {
      dst[i] += delta_or_val[i];
    }
    return;
  }
  for (auto i = first; i < dim; i += step) {
    dst[i] = delta_or_val[i];
  }
}

/* Writes the vectors of `delta_or_val` into the table, one thread per scalar
   element.

   For output vector v (with s = src_offset[v]):
     - nothing is written when dst[v] is null or when existed[s] != status[s];
     - otherwise, if status[s] is true the source element is added to the
       destination (accumulate), else it overwrites it (assign).

   `delta_or_val`: source vectors, `dim` elements each, indexed by
                   src_offset[v].
   `dst`: A pointer of pointer to V which should be on HBM,
          but each value (a pointer of V) could point to a
          memory on HBM or HMEM.
   `existed`: Whether each key existed before this kernel is executed.
   `status`: The existence status for each key when the kernel is being
             executed.
   `src_offset`: maps an output vector index to its source vector index.
   `dim`: number of elements per vector.
   `N`: total number of scalar elements to be written (vectors * dim).
*/
template <class K, class V, class S>
__global__ void write_with_accum_kernel(const V* __restrict delta_or_val,
                                        V** __restrict dst,
                                        const bool* __restrict existed,
                                        const bool* __restrict status,
                                        const int* __restrict src_offset,
                                        const size_t dim, size_t N) {
  const size_t step = blockDim.x * gridDim.x;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N; t += step) {
    const int vec_index = int(t / dim);
    const int dim_index = t % dim;

    // src_offset is only consulted for vectors that have a destination.
    V* const target = dst[vec_index];
    if (target == nullptr) continue;

    const int src = src_offset[vec_index];
    const bool accumulate = status[src];
    if (existed[src] != accumulate) continue;

    const V& operand = delta_or_val[src * dim + dim_index];
    if (accumulate) {
      target[dim_index] += operand;
    } else {
      target[dim_index] = operand;
    }
  }
}

/*
 * update with IO operation. This kernel is
 * usually used for the pure HBM mode for better performance.
 *
 * A tile of TILE_SIZE threads cooperates on one key: the key index is
 * t / TILE_SIZE, so `N` counts threads (the launcher in this file passes
 * n * tile_size). For every non-reserved key the tile
 *   1. locates the bucket and tries to lock a slot for the key,
 *   2. gives the slot back unchanged when the requested mode does not match
 *      what was found (accumulate on a key that is not in the table yet, or
 *      assign on a key that already is),
 *   3. otherwise writes the value vector straight into the bucket, updates
 *      the score and digest, and stores the key into the slot.
 *
 * `value_or_deltas`: `dim` elements per key, indexed by key.
 * `accum_or_assigns`: per key, true = accumulate, false = assign.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void accum_or_assign_kernel_with_io(
    const Table<K, V, S>* __restrict table, const size_t bucket_max_size,
    const size_t buckets_num, const size_t dim, const K* __restrict keys,
    const V* __restrict value_or_deltas, const S* __restrict scores,
    const bool* __restrict accum_or_assigns, const S global_epoch,
    const size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_pos = -1;
    // All lanes of a tile share the same key.
    size_t key_idx = t / TILE_SIZE;

    const K insert_key = keys[key_idx];

    // Reserved keys are skipped entirely.
    if (IS_RESERVED_KEY<K>(insert_key)) continue;

    const S insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);

    const V* insert_value = value_or_deltas + key_idx * dim;
    const bool is_accum = accum_or_assigns[key_idx];

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    // Presumably fills bkt_idx / start_idx as output arguments - confirm in
    // kernel_utils.cuh.
    Bucket<K, V, S>* bucket =
        get_key_position<K>(table->buckets, insert_key, bkt_idx, start_idx,
                            buckets_num, bucket_max_size);

    OccupyResult occupy_result{OccupyResult::INITIAL};
    // Read once; not refreshed when the loop below retries.
    const int bucket_size = buckets_size[bkt_idx];
    do {
      if (bucket_size < bucket_max_size) {
        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      } else {
        // Round the probe start down to a multiple of TILE_SIZE.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,
                                                ScoreFunctor::LOCK_MEM_ORDER,
                                                ScoreFunctor::UNLOCK_MEM_ORDER>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      }

      // Broadcast the result held by lane src_lane to the whole tile.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    if (occupy_result == OccupyResult::REFUSED) continue;

    // Mode mismatch: accumulate requested but the key was not already present
    // (anything other than DUPLICATE), or assign requested but it was.
    // Put back the key the slot held before it was locked and skip this key.
    if ((is_accum && occupy_result != OccupyResult::DUPLICATE) ||
        (!is_accum && occupy_result == OccupyResult::DUPLICATE)) {
      if (g.thread_rank() == src_lane) {
        if (occupy_result == OccupyResult::OCCUPIED_EMPTY) {
          evicted_key = static_cast<K>(EMPTY_KEY);
        }
        if (occupy_result == OccupyResult::OCCUPIED_RECLAIMED) {
          evicted_key = static_cast<K>(RECLAIM_KEY);
        }
        if (occupy_result == OccupyResult::DUPLICATE) {
          evicted_key = insert_key;
        }
        // For any other result evicted_key keeps the value set by the
        // find_and_lock_* call (presumably the key chosen for eviction).
        (bucket->keys(key_pos))
            ->store(evicted_key, ScoreFunctor::UNLOCK_MEM_ORDER);
      }
      g.sync();
      continue;
    }
    // A previously empty or reclaimed slot is now in use: grow the bucket.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    // The whole tile writes the vector into the bucket's storage.
    accum_or_assign_vector<V, TILE_SIZE>(
        g, insert_value, bucket->vectors + key_pos * dim, is_accum, dim);

    // The lane that owns the slot finishes the metadata; the key is stored
    // last, with UNLOCK_MEM_ORDER.
    if (g.thread_rank() == src_lane) {
      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,
                           (occupy_result != OccupyResult::DUPLICATE));
      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);
      (bucket->keys(key_pos))
          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
    }
  }
}

/* Chooses the tile width for accum_or_assign_kernel_with_io from the table's
   load factor and launches it on `stream`: 4 threads per key up to a load
   factor of 0.75, 32 threads per key above that.
 */
template <typename K, typename V, typename S, int Strategy>
struct SelectAccumOrAssignKernelWithIO {
  static void execute_kernel(
      const float& load_factor, const int& block_size,
      const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
      cudaStream_t& stream, const size_t& n,
      const Table<K, V, S>* __restrict table, const K* __restrict keys,
      const V* __restrict value_or_deltas, const S* __restrict scores,
      const bool* __restrict accum_or_assigns, const S global_epoch) {
    if (load_factor <= 0.75) {
      launch_with_tile<4>(block_size, bucket_max_size, buckets_num, dim,
                          stream, n, table, keys, value_or_deltas, scores,
                          accum_or_assigns, global_epoch);
    } else {
      launch_with_tile<32>(block_size, bucket_max_size, buckets_num, dim,
                           stream, n, table, keys, value_or_deltas, scores,
                           accum_or_assigns, global_epoch);
    }
    return;
  }

 private:
  // Launches the kernel with TileSize threads cooperating on each of the `n`
  // keys, i.e. n * TileSize threads in total.
  template <unsigned int TileSize>
  static void launch_with_tile(
      const int& block_size, const size_t bucket_max_size,
      const size_t buckets_num, const size_t dim, cudaStream_t& stream,
      const size_t& n, const Table<K, V, S>* __restrict table,
      const K* __restrict keys, const V* __restrict value_or_deltas,
      const S* __restrict scores, const bool* __restrict accum_or_assigns,
      const S global_epoch) {
    const size_t N = n * TileSize;
    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    accum_or_assign_kernel_with_io<K, V, S, Strategy, TileSize>
        <<<grid_size, block_size, 0, stream>>>(
            table, bucket_max_size, buckets_num, dim, keys, value_or_deltas,
            scores, accum_or_assigns, global_epoch, N);
  }
};

/*
 * Variant of the accumulate-or-assign kernel that does not copy value data
 * itself. For each key whose slot is successfully locked and whose mode
 * matches, it records where the value should later be written:
 *   value_or_deltas[key] = address of the slot's vector inside the bucket,
 *   founds[key]          = the key's accum_or_assigns flag.
 * src_offset[key] is set to the key's own index for every non-reserved key.
 *
 * A tile of TILE_SIZE threads cooperates on one key; the key index is
 * t / TILE_SIZE, so `N` counts threads rather than keys.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void accum_or_assign_kernel(
    const Table<K, V, S>* __restrict table, const size_t bucket_max_size,
    const size_t buckets_num, const size_t dim, const K* __restrict keys,
    V** __restrict value_or_deltas, const S* __restrict scores,
    const bool* __restrict accum_or_assigns, int* __restrict src_offset,
    bool* __restrict founds, const S global_epoch, size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_pos = -1;
    // All lanes of a tile share the same key.
    size_t key_idx = t / TILE_SIZE;

    const K insert_key = keys[key_idx];

    // Reserved keys are skipped before src_offset is written for them.
    if (IS_RESERVED_KEY<K>(insert_key)) continue;

    const S insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);

    const bool is_accum = accum_or_assigns[key_idx];

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    // Presumably fills bkt_idx / start_idx as output arguments - confirm in
    // kernel_utils.cuh.
    Bucket<K, V, S>* bucket =
        get_key_position<K>(table->buckets, insert_key, bkt_idx, start_idx,
                            buckets_num, bucket_max_size);

    // Identity mapping; note the size_t index is narrowed to int here.
    if (g.thread_rank() == 0) {
      *(src_offset + key_idx) = key_idx;
    }

    OccupyResult occupy_result{OccupyResult::INITIAL};
    // Read once; not refreshed when the loop below retries.
    const int bucket_size = buckets_size[bkt_idx];
    do {
      if (bucket_size < bucket_max_size) {
        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      } else {
        // Round the probe start down to a multiple of TILE_SIZE.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,
                                                ScoreFunctor::LOCK_MEM_ORDER,
                                                ScoreFunctor::UNLOCK_MEM_ORDER>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      }

      // Broadcast the result held by lane src_lane to the whole tile.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    if (occupy_result == OccupyResult::REFUSED) continue;

    // Mode mismatch: accumulate requested but the key was not already present
    // (anything other than DUPLICATE), or assign requested but it was.
    // Put back the key the slot held before it was locked and skip this key.
    if ((is_accum && occupy_result != OccupyResult::DUPLICATE) ||
        (!is_accum && occupy_result == OccupyResult::DUPLICATE)) {
      if (g.thread_rank() == src_lane) {
        if (occupy_result == OccupyResult::OCCUPIED_EMPTY) {
          evicted_key = static_cast<K>(EMPTY_KEY);
        }
        if (occupy_result == OccupyResult::OCCUPIED_RECLAIMED) {
          evicted_key = static_cast<K>(RECLAIM_KEY);
        }
        if (occupy_result == OccupyResult::DUPLICATE) {
          evicted_key = insert_key;
        }

        // For any other result evicted_key keeps the value set by the
        // find_and_lock_* call (presumably the key chosen for eviction).
        (bucket->keys(key_pos))
            ->store(evicted_key, ScoreFunctor::UNLOCK_MEM_ORDER);
      }
      g.sync();
      continue;
    }

    // A previously empty or reclaimed slot is now in use: grow the bucket.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    // The lane that owns the slot publishes the destination address and the
    // mode flag, updates digest and score, and stores the key last, with
    // UNLOCK_MEM_ORDER.
    if (g.thread_rank() == src_lane) {
      *(value_or_deltas + key_idx) = (bucket->vectors + key_pos * dim);
      *(founds + key_idx) = is_accum;
      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);
      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,
                           (occupy_result != OccupyResult::DUPLICATE));
      (bucket->keys(key_pos))
          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
    }
  }
}

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/contains.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

/* Plain bundle of the arguments of a "contains" lookup: the bucket array and
   its length, the value dimension, the keys to look up, the per-key result
   flags and the number of keys.
 */
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct ContainsKernelParams {
  ContainsKernelParams(Bucket<K, V, S>* __restrict buckets_,
                       size_t buckets_num_, uint32_t dim_,
                       const K* __restrict keys_, bool* __restrict founds_,
                       size_t n_)
      : buckets{buckets_},
        buckets_num{buckets_num_},
        dim{dim_},
        keys{keys_},
        founds{founds_},
        n{n_} {}

  Bucket<K, V, S>* __restrict buckets;  // bucket array
  size_t buckets_num;                   // number of entries in `buckets`
  uint32_t dim;                         // value dimension
  const K* __restrict keys;             // keys to look up
  bool* __restrict founds;              // output flags, one per key
  size_t n;                             // number of keys
};

/**
 * Pipelined membership test: for every key in `keys`, writes to `founds`
 * whether the key is present in its bucket.
 *
 * Layout assumed by the hard-coded constants below:
 *   - blockDim.x == 128 (four 32-thread groups per block);
 *   - every bucket holds 128 slots, with one digest byte per slot stored in
 *     the 128 bytes immediately preceding the bucket's key array;
 *   - each group owns up to 32 consecutive keys and processes them one
 *     after another, using all 32 threads to deal with one key.
 *
 * Per key the group runs a software pipeline built on the
 * `__pipeline_memcpy_async` primitives:
 *   Step1: prefetch the 128 digests of the next key's bucket;
 *   Step2: compare digests, then fetch the candidate keys;
 *   Step3: compare the candidate keys of the previous key and write the
 *          result.
 *
 * All key and bucket indices are computed in `size_t` so that tables with
 * more than 2^31 slots and batches with more than 2^31 keys are addressed
 * correctly.
 *
 * @param buckets      Base pointer of the bucket array.
 * @param buckets_num  Number of buckets.
 * @param dim          Unused by this kernel.
 * @param keys         Keys to look up (`n` entries).
 * @param founds       Output flags (`n` entries); every entry is written.
 * @param n            Number of keys.
 */
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
__global__ void contains_kernel_pipeline(Bucket<K, V, S>* buckets,
                                         const size_t buckets_num,
                                         const int dim,
                                         const K* __restrict keys,
                                         bool* __restrict founds, size_t n) {
  constexpr int GROUP_SIZE = 32;
  constexpr int RESERVE = 16;
  constexpr int BLOCK_SIZE = 128;
  constexpr int BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;

  __shared__ int sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr[BLOCK_SIZE];
  // Reuse: once a key's digest has been consumed in Step2, the same slot
  // holds the number of candidate keys for that key.
  int* sm_counts = sm_target_digests;

  // Double buffer
  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];
  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];

  // Initialization
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  // 64-bit index of the first key handled by this block / this group.
  const size_t block_key_base = static_cast<size_t>(blockIdx.x) * blockDim.x;
  const size_t key_idx_base = block_key_base + groupID * GROUP_SIZE;
  if (key_idx_base >= n) return;
  // Number of keys this group actually owns (the last group may be partial).
  const size_t remaining = n - key_idx_base;
  int loop_num = remaining < static_cast<size_t>(GROUP_SIZE)
                     ? static_cast<int>(remaining)
                     : GROUP_SIZE;
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    K target_key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = target_key;
    const K hashed_key = Murmur3HashDevice(target_key);
    // The digest is the byte at bits [32:39] of the hash.
    const uint8_t target_digest = static_cast<uint8_t>(hashed_key >> 32);
    sm_target_digests[idx_block] = static_cast<uint32_t>(target_digest);
    // Keep the slot/bucket index in size_t: the modulus can exceed INT_MAX
    // for large tables, and truncating it would select a wrong (or
    // out-of-range) bucket.
    const size_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);
    const size_t bkt_idx = global_idx / BUCKET_SIZE;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;
    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),
                            sizeof(K*));
    __pipeline_commit();
  }
  __pipeline_wait_prior(0);

  // Pipeline loading: fetch the digests of the first key's bucket. Each
  // thread copies 4 digest bytes.
  uint8_t* digests_ptr =
      reinterpret_cast<uint8_t*>(sm_keys_ptr[groupID * GROUP_SIZE]) -
      BUCKET_SIZE;
  __pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
                          digests_ptr + rank * 4, sizeof(uint32_t));
  __pipeline_commit();
  // Empty batch so that the `__pipeline_wait_prior(2)` calls in the first
  // iteration line up with the steady-state batch count.
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    /* Step1: prefetch all digests in one bucket */
    if ((i + 1) < loop_num) {
      uint8_t* digests_ptr =
          reinterpret_cast<uint8_t*>(sm_keys_ptr[key_idx_block + 1]) -
          BUCKET_SIZE;
      __pipeline_memcpy_async(
          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,
          digests_ptr + rank * 4, sizeof(uint32_t));
    }
    __pipeline_commit();

    /* Step2: check digests and load possible keys */
    uint32_t target_digest = sm_target_digests[key_idx_block];
    // Replicate the digest byte into all four bytes for the SIMD compare.
    uint32_t target_digests = __byte_perm(target_digest, target_digest, 0x0000);
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(2);
    uint32_t probing_digests =
        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);
    // Compress the per-byte compare result into a 4-bit mask (bit k set when
    // byte k matched).
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    int find_number = __popc(find_result);
    int group_base = 0;
    if (find_number > 0) {
      // Reserve `find_number` consecutive slots in the candidate buffer.
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr[key_idx_block];
    if (gt_vote == 0) {
      // All candidates fit in the reserve buffer: fetch them asynchronously.
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          __pipeline_memcpy_async(
              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      // More candidates than the reserve buffer can hold: compare the keys
      // directly from global memory and record at most one hit in slot 0.
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) {
          break;
        }
        // Keep looping while any thread still had a candidate this round.
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    /* Step3: check the possible keys of the previous key, write the result */
    if (i > 0) {
      key_idx_block -= 1;
      const size_t key_idx_grid = block_key_base + key_idx_block;
      K target_key = sm_target_keys[key_idx_block];
      int possible_num = sm_counts[key_idx_block];
      __pipeline_wait_prior(2);
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
        }
      }
      int found_vote = g.ballot(found_flag);
      founds[key_idx_grid] = (found_vote > 0);
    }
  }  // End loop

  /* Pipeline emptying: step3, i = loop_num */
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    const size_t key_idx_grid = block_key_base + key_idx_block;
    K target_key = sm_target_keys[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    __pipeline_wait_prior(0);
    bool found_flag = false;
    if (rank < possible_num) {
      K possible_key =
          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];
      if (target_key == possible_key) {
        found_flag = true;
      }
    }
    int found_vote = g.ballot(found_flag);
    founds[key_idx_grid] = (found_vote > 0);
  }

}  // End function

/**
 * Launches `contains_kernel_pipeline` for the keys described by `params`.
 *
 * One 128-thread block is launched per 128 keys (rounded up). An empty
 * batch (`params.n == 0`) is a no-op: a grid with zero blocks is an invalid
 * launch configuration and would leave a CUDA error behind.
 */
template <typename K, typename V, typename S>
struct LaunchPipelineContains {
  /**
   * @param params  Kernel arguments (buckets, keys, output flags, count).
   * @param stream  Stream the kernel is enqueued on.
   */
  static void launch_kernel(ContainsKernelParams<K, V, S>& params,
                            cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    // Nothing to look up; avoid launching with a zero-sized grid.
    if (params.n == 0) return;
    const size_t grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // Using 32 threads to deal with one key
    contains_kernel_pipeline<K, V, S>
        <<<grid_size, BLOCK_SIZE, 0, stream>>>(
            params.buckets, params.buckets_num, params.dim, params.keys,
            params.founds, params.n);
  }
};

/**
 * Selection point for the pipelined `contains` kernel.
 *
 * `ArchTag` is accepted as a template parameter but is not consulted here:
 * every instantiation forwards to `LaunchPipelineContains`.
 */
template <typename K, typename V, typename S = uint64_t,
          typename ArchTag = Sm80>
struct SelectPipelineContainsKernel {
  // Forwards `params` and `stream` unchanged to the pipeline launcher.
  static void select_kernel(ContainsKernelParams<K, V, S>& params,
                            cudaStream_t& stream) {
    using Launcher = LaunchPipelineContains<K, V, S>;
    Launcher::launch_kernel(params, stream);
  }
};

/**
 * Tile-based membership test: a tile of `TILE_SIZE` threads cooperates on
 * one key and probes its bucket through `find_without_lock`.
 *
 * `N` is the total number of threads' worth of work, i.e. the number of
 * keys multiplied by `TILE_SIZE`; thread index `t` handles key
 * `t / TILE_SIZE`. Index arithmetic is carried out in `size_t` so that large
 * batches are neither truncated nor wrapped.
 *
 * Notes on the output:
 *   - Reserved keys are skipped and `found` is left untouched for them.
 *   - `found[key_idx]` is written only by the lane equal to `src_lane`.
 *     NOTE(review): if `find_without_lock` leaves `src_lane` at -1 on a miss,
 *     nothing is written for missing keys either — callers presumably
 *     pre-zero `found`; confirm.
 *
 * @param table            Table descriptor; only `buckets_size` is read.
 * @param buckets          Base pointer of the bucket array.
 * @param bucket_max_size  Capacity of each bucket.
 * @param buckets_num      Number of buckets.
 * @param dim              Unused by this kernel.
 * @param keys             Keys to look up.
 * @param found            Output flags, indexed like `keys`.
 * @param N                Number of keys times `TILE_SIZE`.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__global__ void contains_kernel(const Table<K, V, S>* __restrict table,
                                Bucket<K, V, S>* buckets,
                                const size_t bucket_max_size,
                                const size_t buckets_num, const size_t dim,
                                const K* __restrict keys,
                                bool* __restrict found, size_t N) {
  int* buckets_size = table->buckets_size;

  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int rank = g.thread_rank();

  // Widen before multiplying so neither the start index nor the stride can
  // wrap in 32-bit arithmetic.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  for (size_t t = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       t < N; t += stride) {
    const size_t key_idx = t / TILE_SIZE;

    const K find_key = keys[key_idx];
    if (IS_RESERVED_KEY<K>(find_key)) continue;

    int key_pos = -1;
    int src_lane = -1;
    size_t bkt_idx = 0;
    size_t start_idx = 0;

    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    // When the bucket is full, align the probe start to a tile boundary.
    const int bucket_size = buckets_size[bkt_idx];
    if (bucket_size >= bucket_max_size) {
      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
    }

    OccupyResult occupy_result{OccupyResult::INITIAL};
    occupy_result = find_without_lock<K, V, S, TILE_SIZE>(
        g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size);

    // Exactly one lane of the tile publishes the result.
    if (rank == src_lane) {
      *(found + key_idx) = (occupy_result == OccupyResult::DUPLICATE);
    }
  }
}

/**
 * Chooses the tile width for `contains_kernel` from the table load factor
 * and launches it: 4 threads per key when `load_factor <= 0.75`, otherwise
 * 16 threads per key.
 */
template <typename K, typename V, typename S>
struct SelectContainsKernel {
  /**
   * @param load_factor      Current load factor of the table.
   * @param block_size       Threads per block for the launch.
   * @param bucket_max_size  Capacity of each bucket.
   * @param buckets_num      Number of buckets.
   * @param dim              Forwarded to the kernel.
   * @param stream           Stream the kernel is enqueued on.
   * @param n                Number of keys.
   * @param table            Table descriptor forwarded to the kernel.
   * @param buckets          Base pointer of the bucket array.
   * @param keys             Keys to look up.
   * @param found            Output flags.
   */
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             bool* __restrict found) {
    const bool use_narrow_tile = (load_factor <= 0.75);
    if (use_narrow_tile) {
      launch_tiled<4>(block_size, bucket_max_size, buckets_num, dim, stream, n,
                      table, buckets, keys, found);
      return;
    }
    launch_tiled<16>(block_size, bucket_max_size, buckets_num, dim, stream, n,
                     table, buckets, keys, found);
  }

 private:
  // Launches `contains_kernel` with TILE threads cooperating on each key.
  template <unsigned int TILE>
  static void launch_tiled(const int& block_size, const size_t bucket_max_size,
                           const size_t buckets_num, const size_t dim,
                           cudaStream_t& stream, const size_t& n,
                           const Table<K, V, S>* __restrict table,
                           Bucket<K, V, S>* buckets, const K* __restrict keys,
                           bool* __restrict found) {
    // The kernel is indexed per thread, so the work size is keys * TILE.
    const size_t N = n * TILE;
    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    contains_kernel<K, V, S, TILE><<<grid_size, block_size, 0, stream>>>(
        table, buckets, bucket_max_size, buckets_num, dim, keys, found, N);
  }
};

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/dual_bucket_lookup.cuh
================================================
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "dual_bucket_utils.cuh"
#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

/**
 * Dual-bucket pipeline lookup kernel (sequential two-bucket search).
 *
 * For each key, computes (b1, b2) via get_dual_bucket_indices from the
 * Murmur3 hash. First probes b1; if not found, probes b2.
 * Uses the dual-bucket digest (hash bit[56:63]) to avoid digest collision
 * with b2 addressing.
 *
 * Architecture: Based on lookup_kernel_with_io_pipeline_v1 with 32 threads
 * per key, 128-thread blocks, 128-slot buckets. 4-stage IO pipeline
 * (prefetch digests -> digest match + key load -> key verify + value prefetch
 * -> value writeback). The pipeline is run once over b1 (Pass 1) and, if any
 * key of the group is still unfound, once more over b2 (Pass 2).
 *
 * Launch requirements implied by the constants below: blockDim.x == 128 and
 * one digest byte per slot stored in the 128 bytes directly before each
 * bucket's key array.
 *
 * Output contract:
 *   - `values` / `scores` are written only for keys that were found;
 *   - `found_functor(idx, key, true)` is called for b1 hits, and
 *     `found_functor(idx, key, hit_in_b2)` for keys that missed b1.
 *
 * Global key indices and value offsets are computed in `size_t`, so
 * `key_idx * dim` cannot overflow 32-bit arithmetic for large batches.
 *
 * @param buckets        Base pointer of the bucket array.
 * @param buckets_size   Not read by this kernel.
 * @param buckets_num    Number of buckets.
 * @param dim            Value length per key, in units of `VecV`.
 * @param keys           Keys to look up (`n` entries).
 * @param values         Output values, `dim` `VecV` elements per key.
 * @param scores         Output scores (written through `CopyScore::stg`).
 * @param found_functor  Callback reporting the found status per key.
 * @param n              Number of keys.
 */
template <class K, class V, class S, class VecV,
          typename CopyScore = CopyScoreEmpty<S, K, 128>,
          typename CopyValue = CopyValueTwoGroup<VecV, 32>,
          typename FoundFunctor = FoundFunctorV1<K>, int VALUE_BUF = 56>
__global__ void dual_bucket_pipeline_lookup_kernel_with_io(
    Bucket<K, V, S>* buckets, const int32_t* __restrict__ buckets_size,
    const size_t buckets_num, const int dim, const K* __restrict keys,
    VecV* __restrict values, S* __restrict scores, FoundFunctor found_functor,
    size_t n) {
  constexpr int GROUP_SIZE = 32;
  constexpr int RESERVE = 16;
  constexpr int BLOCK_SIZE = 128;
  constexpr int BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;

  using BUCKET = Bucket<K, V, S>;

  // Shared memory declarations.
  __shared__ int sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr1[BLOCK_SIZE];       // b1 bucket keys ptr
  __shared__ K* sm_keys_ptr2[BLOCK_SIZE];       // b2 bucket keys ptr
  __shared__ VecV* sm_values_ptr1[BLOCK_SIZE];  // b1 values ptr
  __shared__ VecV* sm_values_ptr2[BLOCK_SIZE];  // b2 values ptr
  __shared__ S sm_target_scores[BLOCK_SIZE];
  // Reuse sm_target_digests: per key the slot holds, in turn, the digest,
  // the candidate count and finally the found flag.
  int* sm_counts = sm_target_digests;
  int* sm_founds = sm_counts;
  // Double buffer
  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];
  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];
  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];
  __shared__ VecV sm_vector[2][GROUP_NUM][VALUE_BUF];

  // Initialization.
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  // 64-bit index of the first key handled by this block / this group.
  const size_t block_key_base = static_cast<size_t>(blockIdx.x) * blockDim.x;
  const size_t key_idx_base = block_key_base + groupID * GROUP_SIZE;
  if (key_idx_base >= n) return;
  // Number of keys this group actually owns (the last group may be partial).
  const size_t remaining = n - key_idx_base;
  int loop_num = remaining < static_cast<size_t>(GROUP_SIZE)
                     ? static_cast<int>(remaining)
                     : GROUP_SIZE;

  // Phase 1: Initialize per-key data (hash, digest, bucket pointers).
  // Save digest in register to avoid recomputing Murmur3 hash in Pass 2
  // (sm_target_digests is aliased with sm_counts/sm_founds and gets
  // corrupted during Pass 1).
  uint32_t reg_target_digest = 0;
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    K target_key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = target_key;
    const K hashed_key = Murmur3HashDevice(target_key);

    // Dual-bucket digest: bit[56:63]
    const uint8_t target_digest =
        static_cast<uint8_t>(static_cast<uint64_t>(hashed_key) >> 56);
    reg_target_digest = static_cast<uint32_t>(target_digest);
    sm_target_digests[idx_block] = reg_target_digest;

    // Dual-bucket positions (centralized in dual_bucket_utils.cuh).
    size_t bkt_idx1, bkt_idx2;
    get_dual_bucket_indices<K>(hashed_key, buckets_num, bkt_idx1, bkt_idx2);

    BUCKET* bucket1 = buckets + bkt_idx1;
    BUCKET* bucket2 = buckets + bkt_idx2;
    sm_keys_ptr1[idx_block] = reinterpret_cast<K*>(bucket1->keys(0));
    sm_keys_ptr2[idx_block] = reinterpret_cast<K*>(bucket2->keys(0));
    __pipeline_memcpy_async(sm_values_ptr1 + idx_block, &(bucket1->vectors),
                            sizeof(VecV*));
    __pipeline_commit();
    // This copy is not committed here; it joins the batch committed by the
    // next __pipeline_commit() below (the b1 digest preload).
    // sm_values_ptr2 is first read in Pass 2, after Pass 1 has drained the
    // pipeline with __pipeline_wait_prior(0).
    __pipeline_memcpy_async(sm_values_ptr2 + idx_block, &(bucket2->vectors),
                            sizeof(VecV*));
  }
  __pipeline_wait_prior(0);

  // The same 4-step pipeline is written out twice below: all keys of the
  // group are first streamed through bucket b1, then the keys that missed
  // are streamed through bucket b2.

  // --- PASS 1: Search bucket b1 ---
  // Pipeline loading for b1.
  {
    uint8_t* digests_ptr =
        reinterpret_cast<uint8_t*>(sm_keys_ptr1[groupID * GROUP_SIZE]) -
        BUCKET_SIZE;
    __pipeline_memcpy_async(
        sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
        digests_ptr + rank * 4, sizeof(uint32_t));
  }
  __pipeline_commit();
  // Two empty batches so the __pipeline_wait_prior(3) calls of the first
  // iterations line up with the steady-state batch count.
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    // Step1: prefetch digests for next key's b1 bucket.
    if ((i + 1) < loop_num) {
      uint8_t* digests_ptr =
          reinterpret_cast<uint8_t*>(sm_keys_ptr1[key_idx_block + 1]) -
          BUCKET_SIZE;
      __pipeline_memcpy_async(
          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,
          digests_ptr + rank * 4, sizeof(uint32_t));
    }
    __pipeline_commit();

    // Step2: check digests and load possible keys.
    uint32_t target_digest = sm_target_digests[key_idx_block];
    // Replicate the digest byte into all four bytes for the SIMD compare.
    uint32_t target_digests_vec =
        __byte_perm(target_digest, target_digest, 0x0000);
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(3);
    uint32_t probing_digests =
        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests_vec);
    // Compress the per-byte compare result into a 4-bit mask.
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    int find_number = __popc(find_result);
    int group_base = 0;
    if (find_number > 0) {
      // Reserve `find_number` consecutive slots in the candidate buffer.
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr1[key_idx_block];
    if (gt_vote == 0) {
      // All candidates fit in the reserve buffer: fetch them asynchronously.
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =
              key_pos;
          __pipeline_memcpy_async(
              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      // Too many candidates for the reserve buffer: compare keys directly
      // from global memory and record at most one hit in slot 0.
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;
            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) break;
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    // Step3: verify keys, prefetch values.
    if (i > 0) {
      int prev_block = groupID * GROUP_SIZE + i - 1;
      K target_key = sm_target_keys[prev_block];
      // Read the count before the slot is reused as the found flag.
      int possible_num = sm_counts[prev_block];
      sm_founds[prev_block] = 0;
      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr1, prev_block);
      VecV* value_ptr = sm_values_ptr1[prev_block];
      __pipeline_wait_prior(3);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];
        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
          CopyScore::ldg_sts(sm_target_scores + prev_block,
                             score_ptr + key_pos);
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        VecV* v_dst = sm_vector[diff_buf(i)][groupID];
        sm_founds[prev_block] = 1;
        // Broadcast the matching slot from the first lane that found it.
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        VecV* v_src = value_ptr + target_pos * dim;
        CopyValue::ldg_sts(rank, v_dst, v_src, dim);
      }
    }
    __pipeline_commit();

    // Step4: write back value and score.
    if (i > 1) {
      int wb_block = groupID * GROUP_SIZE + i - 2;
      const size_t key_idx_grid = block_key_base + wb_block;
      VecV* v_src = sm_vector[same_buf(i)][groupID];
      // size_t * dim: the output offset must not be computed in 32 bits.
      VecV* v_dst = values + key_idx_grid * dim;
      int found_flag = sm_founds[wb_block];
      __pipeline_wait_prior(3);
      if (found_flag > 0) {
        S score_ = CopyScore::lgs(sm_target_scores + wb_block);
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        CopyScore::stg(scores + key_idx_grid, score_);
      }
    }
  }

  // Pipeline emptying for b1: step3 for last key.
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    K target_key = sm_target_keys[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    sm_founds[key_idx_block] = 0;
    S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr1, key_idx_block);
    VecV* value_ptr = sm_values_ptr1[key_idx_block];
    __pipeline_wait_prior(1);
    int key_pos;
    bool found_flag = false;
    if (rank < possible_num) {
      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];
      K possible_key =
          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];
      if (target_key == possible_key) {
        found_flag = true;
        CopyScore::ldg_sts(sm_target_scores + key_idx_block,
                           score_ptr + key_pos);
      }
    }
    int found_vote = g.ballot(found_flag);
    if (found_vote) {
      sm_founds[key_idx_block] = 1;
      int src_lane = __ffs(found_vote) - 1;
      int target_pos = g.shfl(key_pos, src_lane);
      VecV* v_src = value_ptr + target_pos * dim;
      VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];
      CopyValue::ldg_sts(rank, v_dst, v_src, dim);
    }
  }
  __pipeline_commit();

  // Pipeline emptying: step4 for second-to-last key.
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    const size_t key_idx_grid = block_key_base + key_idx_block;
    VecV* v_src = sm_vector[same_buf(loop_num)][groupID];
    VecV* v_dst = values + key_idx_grid * dim;
    int found_flag = sm_founds[key_idx_block];
    __pipeline_wait_prior(1);
    if (found_flag > 0) {
      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      CopyScore::stg(scores + key_idx_grid, score_);
    }
  }

  // Pipeline emptying: step4 for last key.
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    const size_t key_idx_grid = block_key_base + key_idx_block;
    VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];
    VecV* v_dst = values + key_idx_grid * dim;
    int found_flag = sm_founds[key_idx_block];
    __pipeline_wait_prior(0);
    if (found_flag > 0) {
      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      CopyScore::stg(scores + key_idx_grid, score_);
    }
  }

  // Finalize b1 pass and record found status.
  // Keys found in b1 are marked. Unfound keys need b2 search.
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    const size_t key_idx_grid = block_key_base + key_idx_block;
    // Only write found for b1 hits; b2 pass will handle misses.
    if (sm_founds[key_idx_block] > 0) {
      found_functor(key_idx_grid, sm_target_keys[key_idx_block], true);
    }
  }

  // --- PASS 2: Search bucket b2 for keys not found in b1 ---
  // Count unfound keys. If all found in b1, skip b2 entirely.
  int any_unfound = 0;
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    if (sm_founds[key_idx_block] == 0) {
      any_unfound = 1;
    }
  }
  any_unfound = g.any(any_unfound);
  if (!any_unfound) return;

  // Save b1 found flags (sm_founds will be reused).
  // We use a simple approach: store per-thread found flag in register.
  // Lane r holds the flag of the group's r-th key.
  int b1_found = 0;
  if (rank < loop_num) {
    b1_found = sm_founds[groupID * GROUP_SIZE + rank];
  }

  // Restore digests from registers saved during Phase 1 init.
  // sm_target_digests was aliased with sm_counts/sm_founds and corrupted
  // during Pass 1.  Using the register avoids recomputing Murmur3 hash.
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    sm_target_digests[idx_block] = reg_target_digest;
  }
  __syncwarp();

  // Pipeline loading for b2.
  {
    uint8_t* digests_ptr =
        reinterpret_cast<uint8_t*>(sm_keys_ptr2[groupID * GROUP_SIZE]) -
        BUCKET_SIZE;
    __pipeline_memcpy_async(
        sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
        digests_ptr + rank * 4, sizeof(uint32_t));
  }
  __pipeline_commit();
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;
    // Check if this key was already found in b1.
    int skip = g.shfl(b1_found, i);

    // Step1: prefetch digests for next key's b2 bucket.
    if ((i + 1) < loop_num) {
      uint8_t* digests_ptr =
          reinterpret_cast<uint8_t*>(sm_keys_ptr2[key_idx_block + 1]) -
          BUCKET_SIZE;
      __pipeline_memcpy_async(
          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,
          digests_ptr + rank * 4, sizeof(uint32_t));
    }
    __pipeline_commit();

    // Step2: check digests and load possible keys (skip if found in b1).
    // Read digest BEFORE zeroing sm_counts (they alias sm_target_digests).
    uint32_t target_digest = sm_target_digests[key_idx_block];
    sm_counts[key_idx_block] = 0;
    if (!skip) {
      uint32_t target_digests_vec =
          __byte_perm(target_digest, target_digest, 0x0000);
      __pipeline_wait_prior(3);
      uint32_t probing_digests =
          sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];
      uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests_vec);
      uint32_t find_result = 0;
      if ((find_result_ & 0x01) != 0) find_result |= 0x01;
      if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
      if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
      if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
      int find_number = __popc(find_result);
      int group_base = 0;
      if (find_number > 0) {
        group_base = atomicAdd(sm_counts + key_idx_block, find_number);
      }
      bool gt_reserve = (group_base + find_number) > RESERVE;
      int gt_vote = g.ballot(gt_reserve);
      K* key_ptr = sm_keys_ptr2[key_idx_block];
      if (gt_vote == 0) {
        do {
          int digest_idx = __ffs(find_result) - 1;
          if (digest_idx >= 0) {
            find_result &= (find_result - 1);
            int key_pos = rank * 4 + digest_idx;
            sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =
                key_pos;
            __pipeline_memcpy_async(sm_possible_keys[same_buf(i)] +
                                        (groupID * RESERVE + group_base),
                                    key_ptr + key_pos, sizeof(K));
            group_base += 1;
          } else {
            break;
          }
        } while (true);
      } else {
        K target_key = sm_target_keys[key_idx_block];
        sm_counts[key_idx_block] = 0;
        int found_vote = 0;
        bool found = false;
        do {
          int digest_idx = __ffs(find_result) - 1;
          if (digest_idx >= 0) {
            find_result &= (find_result - 1);
            int key_pos = rank * 4 + digest_idx;
            K possible_key = key_ptr[key_pos];
            if (possible_key == target_key) {
              found = true;
              sm_counts[key_idx_block] = 1;
              sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;
              sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;
            }
          }
          found_vote = g.ballot(found);
          if (found_vote) break;
          found_vote = digest_idx >= 0;
        } while (g.any(found_vote));
      }
    } else {
      // Skipped key: still wait so the pipeline stages stay in step.
      __pipeline_wait_prior(3);
    }
    __pipeline_commit();

    // Step3: verify keys and prefetch values from b2.
    if (i > 0) {
      int prev_block = groupID * GROUP_SIZE + i - 1;
      int prev_skip = g.shfl(b1_found, i - 1);
      if (!prev_skip) {
        K target_key = sm_target_keys[prev_block];
        // Read count BEFORE zeroing (sm_counts aliases sm_founds).
        int possible_num = sm_counts[prev_block];
        sm_founds[prev_block] = 0;
        S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr2, prev_block);
        VecV* value_ptr = sm_values_ptr2[prev_block];
        __pipeline_wait_prior(3);
        int key_pos;
        bool found_flag = false;
        if (rank < possible_num) {
          K possible_key =
              sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];
          key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];
          if (possible_key == target_key) {
            found_flag = true;
            CopyScore::ldg_sts(sm_target_scores + prev_block,
                               score_ptr + key_pos);
          }
        }
        int found_vote = g.ballot(found_flag);
        if (found_vote) {
          VecV* v_dst = sm_vector[diff_buf(i)][groupID];
          sm_founds[prev_block] = 1;
          int src_lane = __ffs(found_vote) - 1;
          int target_pos = g.shfl(key_pos, src_lane);
          VecV* v_src = value_ptr + target_pos * dim;
          CopyValue::ldg_sts(rank, v_dst, v_src, dim);
        }
      } else {
        __pipeline_wait_prior(3);
      }
    }
    __pipeline_commit();

    // Step4: write back values from b2.
    if (i > 1) {
      int wb_block = groupID * GROUP_SIZE + i - 2;
      int prev_skip = g.shfl(b1_found, i - 2);
      if (!prev_skip) {
        const size_t key_idx_grid = block_key_base + wb_block;
        VecV* v_src = sm_vector[same_buf(i)][groupID];
        VecV* v_dst = values + key_idx_grid * dim;
        int found_flag = sm_founds[wb_block];
        __pipeline_wait_prior(3);
        if (found_flag > 0) {
          S score_ = CopyScore::lgs(sm_target_scores + wb_block);
          CopyValue::lds_stg(rank, v_dst, v_src, dim);
          CopyScore::stg(scores + key_idx_grid, score_);
        }
      } else {
        __pipeline_wait_prior(3);
      }
    }
  }

  // Pipeline emptying for b2: step3 for last key.
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    int last_skip = g.shfl(b1_found, loop_num - 1);
    if (!last_skip) {
      K target_key = sm_target_keys[key_idx_block];
      // Read count BEFORE zeroing (sm_counts aliases sm_founds).
      int possible_num = sm_counts[key_idx_block];
      sm_founds[key_idx_block] = 0;
      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr2, key_idx_block);
      VecV* value_ptr = sm_values_ptr2[key_idx_block];
      __pipeline_wait_prior(1);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];
        K possible_key =
            sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];
        if (target_key == possible_key) {
          found_flag = true;
          CopyScore::ldg_sts(sm_target_scores + key_idx_block,
                             score_ptr + key_pos);
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        sm_founds[key_idx_block] = 1;
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        VecV* v_src = value_ptr + target_pos * dim;
        VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];
        CopyValue::ldg_sts(rank, v_dst, v_src, dim);
      }
    } else {
      __pipeline_wait_prior(1);
    }
  }
  __pipeline_commit();

  // Pipeline emptying: step4 for second-to-last key.
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    int prev_skip = g.shfl(b1_found, loop_num - 2);
    if (!prev_skip) {
      const size_t key_idx_grid = block_key_base + key_idx_block;
      VecV* v_src = sm_vector[same_buf(loop_num)][groupID];
      VecV* v_dst = values + key_idx_grid * dim;
      int found_flag = sm_founds[key_idx_block];
      __pipeline_wait_prior(1);
      if (found_flag > 0) {
        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        CopyScore::stg(scores + key_idx_grid, score_);
      }
    } else {
      __pipeline_wait_prior(1);
    }
  }

  // Pipeline emptying: step4 for last key.
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    int last_skip = g.shfl(b1_found, loop_num - 1);
    if (!last_skip) {
      const size_t key_idx_grid = block_key_base + key_idx_block;
      VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];
      VecV* v_dst = values + key_idx_grid * dim;
      int found_flag = sm_founds[key_idx_block];
      __pipeline_wait_prior(0);
      if (found_flag > 0) {
        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        CopyScore::stg(scores + key_idx_grid, score_);
      }
    } else {
      __pipeline_wait_prior(0);
    }
  }

  // Finalize b2 pass: report found for keys found in b2.
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    const size_t key_idx_grid = block_key_base + key_idx_block;
    if (b1_found == 0) {
      // Key was not found in b1; report b2 result.
      found_functor(key_idx_grid, sm_target_keys[key_idx_block],
                    sm_founds[key_idx_block] > 0);
    }
  }
}

// --- Kernel Launchers ---

/**
 * Host-side launcher for `dual_bucket_pipeline_lookup_kernel_with_io`.
 *
 * Launch layout: 128 threads per block, no dynamic shared memory, one block
 * per 128 keys (ceil-divided), enqueued on the caller's stream. The value
 * copy policy is chosen from the value length expressed in `VecV` elements,
 * using a group size of 32:
 *   - more than 64 elements      -> CopyValueMultipleGroup
 *   - more than 32, at most 64   -> CopyValueTwoGroup
 *   - at most 32                 -> CopyValueOneGroup
 *
 * @tparam K            Key type.
 * @tparam V            Value element type as seen by the caller.
 * @tparam S            Score type.
 * @tparam CopyScore    Score copy policy forwarded to the kernel.
 * @tparam VecV         Vector type used to move values (e.g. float4).
 * @tparam ValueBufSize Per-key value buffer size in bytes; forwarded to the
 *                      kernel as a count of `VecV` elements.
 */
template <typename K, typename V, typename S, typename CopyScore, typename VecV,
          uint32_t ValueBufSize>
struct LaunchDualBucketLookupV1 {
  /**
   * Picks the value copy policy and launches the lookup kernel.
   *
   * NOTE: `params.dim` is rewritten in place from units of `V` to units of
   * `VecV`, so the same `params` object must not be passed here twice.
   *
   * @param params       Lookup parameters (buckets, keys, values, scores,
   *                     found_functor, n, dim).
   * @param buckets_size Per-bucket size array forwarded to the kernel.
   * @param stream       CUDA stream the kernel is enqueued on.
   */
  template <template <typename, typename, typename> typename LookupKernelParams>
  static void launch_kernel(LookupKernelParams<K, V, S>& params,
                            const int32_t* buckets_size, cudaStream_t& stream) {
    constexpr int GROUP_SIZE = 32;
    // Re-express the value length in VecV elements. This is done before the
    // empty-batch check so the side effect on `params` is unconditional.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // An empty batch would yield a grid of 0 blocks, which is an invalid
    // launch configuration; there is nothing to look up, so just return.
    if (params.n == 0) {
      return;
    }

    if (params.dim > (GROUP_SIZE * 2)) {
      launch_with<CopyValueMultipleGroup<VecV, GROUP_SIZE>>(params,
                                                            buckets_size,
                                                            stream);
    } else if (params.dim > GROUP_SIZE) {
      launch_with<CopyValueTwoGroup<VecV, GROUP_SIZE>>(params, buckets_size,
                                                       stream);
    } else {
      launch_with<CopyValueOneGroup<VecV, GROUP_SIZE>>(params, buckets_size,
                                                       stream);
    }
  }

 private:
  /**
   * Launches the kernel for one concrete `CopyValue` policy. Expects
   * `params.dim` to already be in units of `VecV` and `params.n` to be
   * non-zero.
   */
  template <typename CopyValue,
            template <typename, typename, typename> typename LookupKernelParams>
  static void launch_with(LookupKernelParams<K, V, S>& params,
                          const int32_t* buckets_size, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    // Value buffer capacity expressed in VecV elements.
    constexpr uint32_t VecSize = ValueBufSize / sizeof(VecV);
    dual_bucket_pipeline_lookup_kernel_with_io<
        K, V, S, VecV, CopyScore, CopyValue, decltype(params.found_functor),
        VecSize>
        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
            params.buckets, buckets_size, params.buckets_num, params.dim,
            params.keys, reinterpret_cast<VecV*>(params.values),
            params.scores, params.found_functor, params.n);
  }
};

// --- Kernel Selector ---

/**
 * Compile-time/run-time selector for the dual-bucket lookup kernel.
 *
 * Only the v1 pipeline kernel (32 threads per key) is used for dual-bucket
 * lookup. The selector chooses:
 *   - the score copy policy, based on whether `params.scores` is null, and
 *   - the widest vector type whose size evenly divides the value size in
 *     bytes (float4, float2, float, uint16_t, then uint8_t).
 *
 * @tparam K       Key type.
 * @tparam V       Value element type.
 * @tparam S       Score type.
 * @tparam ArchTag Architecture tag used to look up the value buffer config.
 */
template <typename K, typename V, typename S = uint64_t,
          typename ArchTag = Sm80>
struct SelectDualBucketLookupKernel {
  using ValueBufConfig = LookupValueBufConfig<ArchTag>;

  /// Value buffer size (bytes) configured for the v1 pipeline kernel.
  static inline uint32_t max_value_size() {
    return ValueBufConfig::size_pipeline_v1;
  }

  /**
   * Selects the score copy policy and forwards to the vector-width dispatch.
   *
   * @param params       Lookup parameters; `params.scores == nullptr` means
   *                     the caller did not supply a scores buffer.
   * @param buckets_size Per-bucket size array forwarded to the launcher.
   * @param stream       CUDA stream forwarded to the launcher.
   */
  template <template <typename, typename, typename> typename LookupKernelParams>
  static void select_kernel(LookupKernelParams<K, V, S>& params,
                            const int32_t* buckets_size, cudaStream_t& stream) {
    constexpr int BUCKET_SIZE = 128;

    // For dual-bucket lookup, we use v1 kernel (32 threads/key) only.
    if (params.scores != nullptr) {
      dispatch_by_vector_width<CopyScoreByPassCache<S, K, BUCKET_SIZE>>(
          params, buckets_size, stream);
    } else {
      dispatch_by_vector_width<CopyScoreEmpty<S, K, BUCKET_SIZE>>(
          params, buckets_size, stream);
    }
  }

 private:
  /**
   * Launches the v1 kernel with the widest vector type whose size divides
   * the value size in bytes (`params.dim * sizeof(V)`).
   */
  template <typename CopyScore,
            template <typename, typename, typename> typename LookupKernelParams>
  static void dispatch_by_vector_width(LookupKernelParams<K, V, S>& params,
                                       const int32_t* buckets_size,
                                       cudaStream_t& stream) {
    constexpr uint32_t kBufBytes = ValueBufConfig::size_pipeline_v1;
    const uint32_t value_bytes =
        static_cast<uint32_t>(params.dim * sizeof(V));

    if (value_bytes % sizeof(float4) == 0) {
      LaunchDualBucketLookupV1<K, V, S, CopyScore, float4,
                               kBufBytes>::launch_kernel(params, buckets_size,
                                                         stream);
      return;
    }
    if (value_bytes % sizeof(float2) == 0) {
      LaunchDualBucketLookupV1<K, V, S, CopyScore, float2,
                               kBufBytes>::launch_kernel(params, buckets_size,
                                                         stream);
      return;
    }
    if (value_bytes % sizeof(float) == 0) {
      LaunchDualBucketLookupV1<K, V, S, CopyScore, float,
                               kBufBytes>::launch_kernel(params, buckets_size,
                                                         stream);
      return;
    }
    if (value_bytes % sizeof(uint16_t) == 0) {
      LaunchDualBucketLookupV1<K, V, S, CopyScore, uint16_t,
                               kBufBytes>::launch_kernel(params, buckets_size,
                                                         stream);
      return;
    }
    // Byte-wise fallback for odd value sizes.
    LaunchDualBucketLookupV1<K, V, S, CopyScore, uint8_t,
                             kBufBytes>::launch_kernel(params, buckets_size,
                                                       stream);
  }
};

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/core_kernels/dual_bucket_upsert.cuh
================================================
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "dual_bucket_utils.cuh"
#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

/**
 * Dual-bucket pipeline upsert kernel — True Two-Choice.
 *
 * Implements dual-bucket insert_or_assign with three distinct phases:
 *   Phase 0: DUPLICATE detection in BOTH buckets (no empty-slot occupation)
 *   Phase 1: D1 Two-Choice load-balance — compare bucket sizes, insert into
 *            the emptier bucket first, fallback to the other
 *   Phase 2: D2 score-eviction — when both buckets full, evict the entry
 *            with the global minimum score across both buckets
 *
 * Key invariant: DUPLICATE search completes in BOTH buckets before any
 * empty-slot insertion attempt. This ensures correct insert_or_assign
 * semantics (no spurious duplicates across buckets).
 *
 * Concurrent model: pure slot-level CAS (no per-bucket Mutex).
 * Constraint: unique_key=true (caller guarantees no duplicate keys in batch).
 *
 * Based on pipeline_upsert_kernel_with_io architecture:
 * - 32 threads per key (GROUP_SIZE)
 * - 128-thread blocks
 * - 128-slot buckets
 * - 4-stage software pipeline
 */
template <class K, class V, class S, class VecV, int BLOCK_SIZE = 128,
          int Strategy = 0>
__global__ void dual_bucket_pipeline_upsert_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, const uint32_t dim, const K* __restrict__ keys,
    const VecV* __restrict__ values, const S* __restrict__ scores, uint64_t n,
    const S global_epoch) {
  constexpr uint32_t BUCKET_SIZE = 128;
  constexpr uint32_t GROUP_SIZE = 32;
  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);
  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);

  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using SMM = SharedMemoryManager_Pipeline_Upsert<K, V, S, VecV, BLOCK_SIZE,
                                                  GROUP_SIZE, BUCKET_SIZE>;
  using ScoreFunctor_ = ScoreFunctor<K, V, S, Strategy>;

  __shared__ extern __align__(alignof(byte16)) byte smem[];

  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  VecD_Comp target_digests;
  K* bucket_keys_ptr1{nullptr};
  K* bucket_keys_ptr2{nullptr};
  VecV* bucket_values_ptr2{nullptr};
  int* bucket_size_ptr2{nullptr};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  uint32_t key_pos = 0;
  uint32_t key_pos2 = 0;  // b2 start position (independent from b1)
  int target_bucket = 1;  // 1 = b1, 2 = b2

  if (kv_idx < n) {
    key = keys[kv_idx];
    if (scores != nullptr) {
      S* sm_param_scores = SMM::param_scores(smem);
      __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));
    }
    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      // Dual-bucket digest from bit[56:63].
      target_digests = dual_bucket_digests_from_hashed<K>(hashed_key);

      // Dual-bucket indices (centralized in dual_bucket_utils.cuh).
      size_t bkt_idx1, bkt_idx2;
      get_dual_bucket_indices<K>(hashed_key, buckets_num, bkt_idx1, bkt_idx2);

      // b1 setup (stored in SMM shared memory).
      const uint32_t lo = static_cast<uint32_t>(hashed_key);
      uint64_t global_idx1 =
          static_cast<uint64_t>(lo % (buckets_num * BUCKET_SIZE));
      key_pos = get_start_position(global_idx1, BUCKET_SIZE);

      // b2 start position from high 32 bits (independent from b1).
      const uint32_t hi =
          static_cast<uint32_t>(static_cast<uint64_t>(hashed_key) >> 32);
      uint64_t global_idx2 =
          static_cast<uint64_t>(hi % (buckets_num * BUCKET_SIZE));
      key_pos2 = get_start_position(global_idx2, BUCKET_SIZE);

      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      sm_buckets_size_ptr[tx] = buckets_size + bkt_idx1;

      BUCKET* bucket1 = buckets + bkt_idx1;
      bucket_keys_ptr1 = reinterpret_cast<K*>(bucket1->keys(0));
      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
      __pipeline_memcpy_async(sm_bucket_values_ptr + tx, &(bucket1->vectors),
                              sizeof(VecV*));

      // b2 setup (stored in registers, broadcast via warp shuffle).
      BUCKET* bucket2 = buckets + bkt_idx2;
      bucket_keys_ptr2 = reinterpret_cast<K*>(bucket2->keys(0));
      bucket_values_ptr2 = reinterpret_cast<VecV*>(bucket2->vectors);
      bucket_size_ptr2 = buckets_size + bkt_idx2;
    } else {
      occupy_result = OccupyResult::ILLEGAL;
    }
  } else {
    occupy_result = OccupyResult::ILLEGAL;
  }

  uint32_t rank = g.thread_rank();
  uint32_t groupID = threadIdx.x / GROUP_SIZE;

  // =========== Main pipeline loop (processes one key per iteration)
  // =========== True Two-Choice algorithm for each key i in the warp:
  //   Phase 0: DUPLICATE detection in BOTH b1 and b2 (no empty occupation)
  //   Phase 1: D1 Two-Choice — compare bucket sizes, try emptier bucket first
  //   Phase 2: D2 score-eviction when both buckets are full

  auto occupy_result_next = g.shfl(occupy_result, 0);
  auto keys_ptr_next = g.shfl(bucket_keys_ptr1, 0);

  // Prefetch b1 digests for first key.
  if (occupy_result_next == OccupyResult::INITIAL) {
    D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, 0);
    D* dst = sm_bucket_digests + rank * Load_LEN;
    D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);
    if (rank * Load_LEN < BUCKET_SIZE) {
      __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));
    }
  }
  __pipeline_commit();
  __pipeline_commit();
  __pipeline_commit();

  for (int32_t i = 0; i < GROUP_SIZE; i++) {
    // === Step 1: Prefetch b1 digests for next key ===
    if (i + 1 < GROUP_SIZE) {
      auto occupy_result_next = g.shfl(occupy_result, i + 1);
      auto keys_ptr_next = g.shfl(bucket_keys_ptr1, i + 1);
      if (occupy_result_next == OccupyResult::INITIAL) {
        D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, diff_buf(i));
        D* dst = sm_bucket_digests + rank * Load_LEN;
        D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);
        if (rank * Load_LEN < BUCKET_SIZE) {
          __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));
        }
      }
    }
    __pipeline_commit();

    // === Step 2: Three-phase True Two-Choice probe ===
    auto occupy_result_cur = g.shfl(occupy_result, i);
    if (occupy_result_cur == OccupyResult::INITIAL) {
      uint32_t tx_cur = groupID * GROUP_SIZE + i;
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      auto bucket_size_ptr1 = sm_buckets_size_ptr[tx_cur];
      K key_cur = g.shfl(key, i);
      auto target_digests_cur = g.shfl(target_digests, i);
      auto start_pos_cur = g.shfl(key_pos, i);
      auto keys_ptr_cur = g.shfl(bucket_keys_ptr1, i);

      // b2 info for key i (shuffled from owning thread).
      auto keys_ptr2_cur = g.shfl(bucket_keys_ptr2, i);
      auto bsize_ptr2_cur = reinterpret_cast<int*>(static_cast<uintptr_t>(
          g.shfl(static_cast<unsigned long long>(
                     reinterpret_cast<uintptr_t>(bucket_size_ptr2)),
                 i)));
      auto start_pos2_cur = g.shfl(key_pos2, i);

      __pipeline_wait_prior(3);
      D* digest_src = SMM::bucket_digests(smem, groupID, same_buf(i));

      // b1 probe offset (from b1's hash).
      uint32_t start_offset = start_pos_cur / Comp_LEN;
      uint32_t probe_offset =
          Comp_LEN * ((start_offset + rank) & (GROUP_SIZE - 1));
      VecD_Comp probe_digests =
          *reinterpret_cast<VecD_Comp*>(digest_src + probe_offset);
      uint32_t cmp_result = __vcmpeq4(probe_digests, target_digests_cur);
      cmp_result &= 0x01010101;

      // b2 probe offset (from b2's independent hash).
      uint32_t start_offset2 = start_pos2_cur / Comp_LEN;
      uint32_t b2_probe_offset =
          Comp_LEN * ((start_offset2 + rank) & (GROUP_SIZE - 1));
      // Load b2 digests (synchronous read).
      D* b2_digests_ptr = BUCKET::digests(keys_ptr2_cur, BUCKET_SIZE, 0);
      VecD_Comp b2_probe_digests =
          *reinterpret_cast<VecD_Comp*>(b2_digests_ptr + b2_probe_offset);
      uint32_t b2_cmp = __vcmpeq4(b2_probe_digests, target_digests_cur);
      b2_cmp &= 0x01010101;

      // ============================================================
      // Phase 0: DUPLICATE detection in BOTH buckets
      // ============================================================

      // --- Phase 0a: DUPLICATE scan in b1 ---
      uint32_t possible_pos = 0;
      bool result = false;
      {
        uint32_t cmp_copy = cmp_result;
        do {
          if (cmp_copy == 0) break;
          int32_t index = (__ffs(cmp_copy) - 1) >> 3;
          cmp_copy &= (cmp_copy - 1);
          possible_pos = probe_offset + index;
          auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);
          K expected_key = key_cur;
          result = current_key->compare_exchange_strong(
              expected_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
        } while (!result);
      }

      uint32_t found_vote = g.ballot(result);
      if (found_vote) {
        // DUPLICATE found in b1 -> update in place.
        int32_t src_lane = __ffs(found_vote) - 1;
        possible_pos = g.shfl(possible_pos, src_lane);
        if (rank == i) {
          occupy_result = OccupyResult::DUPLICATE;
          key_pos = possible_pos;
          target_bucket = 1;
          S* sm_param_scores = SMM::param_scores(smem);
          // Note: desired_when_missed is intentionally used here for
          // DUPLICATE keys.  For kCustomized strategy the actual score
          // semantics are determined by update_with_digest, which
          // overwrites the score unconditionally.  The naming is
          // inherited from the single-bucket API and does not imply
          // "key was absent".
          S score = ScoreFunctor_::desired_when_missed(sm_param_scores, tx,
                                                       global_epoch);
          D digest = get_dual_bucket_digest<K>(key);
          ScoreFunctor_::update_with_digest(bucket_keys_ptr1, key_pos,
                                            sm_param_scores, tx, score,
                                            BUCKET_SIZE, digest, false);
        }
      }

      // --- Phase 0b: DUPLICATE scan in b2 (only if not found in b1) ---
      occupy_result_cur = g.shfl(occupy_result, i);
      if (occupy_result_cur == OccupyResult::INITIAL) {
        result = false;
        possible_pos = 0;
        {
          uint32_t cmp_copy = b2_cmp;
          do {
            if (cmp_copy == 0) break;
            int32_t index = (__ffs(cmp_copy) - 1) >> 3;
            cmp_copy &= (cmp_copy - 1);
            possible_pos = b2_probe_offset + index;
            auto current_key = BUCKET::keys(keys_ptr2_cur, possible_pos);
            K expected_key = key_cur;
            result = current_key->compare_exchange_strong(
                expected_key, static_cast<K>(LOCKED_KEY),
                cuda::std::memory_order_acquire,
                cuda::std::memory_order_relaxed);
          } while (!result);
        }

        found_vote = g.ballot(result);
        if (found_vote) {
          // DUPLICATE found in b2.
          int32_t src_lane = __ffs(found_vote) - 1;
          possible_pos = g.shfl(possible_pos, src_lane);
          if (rank == i) {
            occupy_result = OccupyResult::DUPLICATE;
            key_pos = possible_pos;
            target_bucket = 2;
            S* sm_param_scores = SMM::param_scores(smem);
            // See Phase 0a comment: desired_when_missed is used for
            // DUPLICATE keys; actual semantics governed by
            // update_with_digest.
            S score = ScoreFunctor_::desired_when_missed(sm_param_scores, tx,
                                                         global_epoch);
            D digest = get_dual_bucket_digest<K>(key);
            ScoreFunctor_::update_with_digest(bucket_keys_ptr2, key_pos,
                                              sm_param_scores, tx, score,
                                              BUCKET_SIZE, digest, false);
          }
        }
      }

      // ============================================================
      // Phase 1: D1 Two-Choice load-balanced EMPTY insertion
      // ============================================================
      occupy_result_cur = g.shfl(occupy_result, i);
      if (occupy_result_cur == OccupyResult::INITIAL) {
        auto bucket_size1 = *bucket_size_ptr1;
        auto bucket_size2 = *bsize_ptr2_cur;

        // True Two-Choice: prefer the emptier bucket.
        bool prefer_b1 = (bucket_size1 <= bucket_size2);

        // First bucket (emptier one).
        K* first_keys_ptr = prefer_b1 ? keys_ptr_cur : keys_ptr2_cur;
        int* first_bsize_ptr = prefer_b1 ? bucket_size_ptr1 : bsize_ptr2_cur;
        int first_size = prefer_b1 ? bucket_size1 : bucket_size2;
        VecD_Comp first_probe_digests =
            prefer_b1 ? probe_digests : b2_probe_digests;
        uint32_t first_probe_offset =
            prefer_b1 ? probe_offset : b2_probe_offset;
        int first_bucket_id = prefer_b1 ? 1 : 2;

        // Second bucket (fuller one).
        K* second_keys_ptr = prefer_b1 ? keys_ptr2_cur : keys_ptr_cur;
        int* second_bsize_ptr = prefer_b1 ? bsize_ptr2_cur : bucket_size_ptr1;
        int second_size = prefer_b1 ? bucket_size2 : bucket_size1;
        VecD_Comp second_probe_digests =
            prefer_b1 ? b2_probe_digests : probe_digests;
        uint32_t second_probe_offset =
            prefer_b1 ? b2_probe_offset : probe_offset;
        int second_bucket_id = prefer_b1 ? 2 : 1;

        // --- Try EMPTY in first (emptier) bucket ---
        if (first_size < BUCKET_SIZE) {
          VecD_Comp empty_digests_ = dual_bucket_empty_digests<K>();
          uint32_t empty_result =
              __vcmpeq4(first_probe_digests, empty_digests_);
          empty_result &= 0x01010101;
          result = false;
          possible_pos = 0;
          for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {
            if (rank == offset) {
              do {
                if (empty_result == 0) break;
                int32_t index = (__ffs(empty_result) - 1) >> 3;
                empty_result &= (empty_result - 1);
                possible_pos = first_probe_offset + index;
                auto current_key = BUCKET::keys(first_keys_ptr, possible_pos);
                K expected_key = static_cast<K>(EMPTY_KEY);
                result = current_key->compare_exchange_strong(
                    expected_key, static_cast<K>(LOCKED_KEY),
                    cuda::std::memory_order_acquire,
                    cuda::std::memory_order_relaxed);
              } while (!result);
            }
            found_vote = g.ballot(result);
            if (found_vote) {
              int32_t src_lane = __ffs(found_vote) - 1;
              possible_pos = g.shfl(possible_pos, src_lane);
              if (rank == i) {
                occupy_result = OccupyResult::OCCUPIED_EMPTY;
                key_pos = possible_pos;
                target_bucket = first_bucket_id;
                S* sm_param_scores = SMM::param_scores(smem);
                S score = ScoreFunctor_::desired_when_missed(sm_param_scores,
                                                             tx, global_epoch);
                D digest = get_dual_bucket_digest<K>(key);
                K* target_keys = (first_bucket_id == 1) ? bucket_keys_ptr1
                                                        : bucket_keys_ptr2;
                ScoreFunctor_::update_with_digest(target_keys, key_pos,
                                                  sm_param_scores, tx, score,
                                                  BUCKET_SIZE, digest, true);
                atomicAdd(first_bsize_ptr, 1);
              }
              break;
            }
          }
        }

        // --- Try EMPTY in second (fuller) bucket (fallback) ---
        occupy_result_cur = g.shfl(occupy_result, i);
        if (occupy_result_cur == OccupyResult::INITIAL &&
            second_size < BUCKET_SIZE) {
          VecD_Comp empty_digests_ = dual_bucket_empty_digests<K>();
          uint32_t empty_result =
              __vcmpeq4(second_probe_digests, empty_digests_);
          empty_result &= 0x01010101;
          result = false;
          possible_pos = 0;
          for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {
            if (rank == offset) {
              do {
                if (empty_result == 0) break;
                int32_t index = (__ffs(empty_result) - 1) >> 3;
                empty_result &= (empty_result - 1);
                possible_pos = second_probe_offset + index;
                auto current_key = BUCKET::keys(second_keys_ptr, possible_pos);
                K expected_key = static_cast<K>(EMPTY_KEY);
                result = current_key->compare_exchange_strong(
                    expected_key, static_cast<K>(LOCKED_KEY),
                    cuda::std::memory_order_acquire,
                    cuda::std::memory_order_relaxed);
              } while (!result);
            }
            found_vote = g.ballot(result);
            if (found_vote) {
              int32_t src_lane = __ffs(found_vote) - 1;
              possible_pos = g.shfl(possible_pos, src_lane);
              if (rank == i) {
                occupy_result = OccupyResult::OCCUPIED_EMPTY;
                key_pos = possible_pos;
                target_bucket = second_bucket_id;
                S* sm_param_scores = SMM::param_scores(smem);
                S score = ScoreFunctor_::desired_when_missed(sm_param_scores,
                                                             tx, global_epoch);
                D digest = get_dual_bucket_digest<K>(key);
                K* target_keys = (second_bucket_id == 1) ? bucket_keys_ptr1
                                                         : bucket_keys_ptr2;
                ScoreFunctor_::update_with_digest(target_keys, key_pos,
                                                  sm_param_scores, tx, score,
                                                  BUCKET_SIZE, digest, true);
                atomicAdd(second_bsize_ptr, 1);
              }
              break;
            }
          }
        }
      }

      // ============================================================
      // Phase 2: D2 Score Eviction (both buckets full)
      // ============================================================
      occupy_result_cur = g.shfl(occupy_result, i);
      if (occupy_result_cur == OccupyResult::INITIAL) {
        S* sm_param_scores = SMM::param_scores(smem);
        S score_cur = ScoreFunctor_::desired_when_missed(sm_param_scores,
                                                         tx_cur, global_epoch);

        S* b1_scores = BUCKET::scores(keys_ptr_cur, BUCKET_SIZE, 0);
        S* b2_scores = BUCKET::scores(keys_ptr2_cur, BUCKET_SIZE, 0);

        // Cache scores in per-thread registers for eviction retry.
        constexpr int SCORES_PER_THREAD =
            BUCKET_SIZE / (GROUP_SIZE * Load_LEN_S) * Load_LEN_S;
        S b1_cached[SCORES_PER_THREAD];
        int b1_pos_cached[SCORES_PER_THREAD];
        S b2_cached[SCORES_PER_THREAD];
        int b2_pos_cached[SCORES_PER_THREAD];
        {
          int idx = 0;
          for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {
            S tmp[Load_LEN_S];
            *reinterpret_cast<byte16*>(tmp) =
                *reinterpret_cast<byte16*>(b1_scores + rank * Load_LEN_S + j);
            for (int k = 0; k < Load_LEN_S; k++) {
              b1_cached[idx] = tmp[k];
              b1_pos_cached[idx] = rank * Load_LEN_S + j + k;
              idx++;
            }
          }
        }
        {
          int idx = 0;
          for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {
            S tmp[Load_LEN_S];
            *reinterpret_cast<byte16*>(tmp) =
                *reinterpret_cast<byte16*>(b2_scores + rank * Load_LEN_S + j);
            for (int k = 0; k < Load_LEN_S; k++) {
              b2_cached[idx] = tmp[k];
              b2_pos_cached[idx] = rank * Load_LEN_S + j + k;
              idx++;
            }
          }
        }

        // Eviction retry loop.
        while (true) {
          occupy_result_cur = g.shfl(occupy_result, i);
          if (occupy_result_cur != OccupyResult::INITIAL) break;

          // Find per-thread min for b1 and b2 from cached scores.
          S min_b1_local = static_cast<S>(MAX_SCORE);
          int min_b1_idx = -1;
          for (int s = 0; s < SCORES_PER_THREAD; s++) {
            if (b1_cached[s] < min_b1_local) {
              min_b1_local = b1_cached[s];
              min_b1_idx = s;
            }
          }
          S min_b2_local = static_cast<S>(MAX_SCORE);
          int min_b2_idx = -1;
          for (int s = 0; s < SCORES_PER_THREAD; s++) {
            if (b2_cached[s] < min_b2_local) {
              min_b2_local = b2_cached[s];
              min_b2_idx = s;
            }
          }

          S min_b1_global = cg::reduce(g, min_b1_local, cg::less<S>());
          S min_b2_global = cg::reduce(g, min_b2_local, cg::less<S>());
          S overall_min =
              (min_b1_global <= min_b2_global) ? min_b1_global : min_b2_global;

          // REFUSED: new score too low to evict anything.
          if (score_cur < overall_min) {
            if (rank == i) {
              occupy_result = OccupyResult::REFUSED;
            }
            break;
          }

          // Pick the bucket with lower min_score (Two-Choice eviction).
          bool use_b1 = (min_b1_global <= min_b2_global);
          S min_score_local = use_b1 ? min_b1_local : min_b2_local;
          int min_local_idx = use_b1 ? min_b1_idx : min_b2_idx;
          int min_pos_local = (min_local_idx >= 0)
                                  ? (use_b1 ? b1_pos_cached[min_local_idx]
                                            : b2_pos_cached[min_local_idx])
                                  : -1;
          S min_score_global = use_b1 ? min_b1_global : min_b2_global;
          K* evict_keys_ptr = use_b1 ? keys_ptr_cur : keys_ptr2_cur;
          int* evict_bsize_ptr = use_b1 ? bucket_size_ptr1 : bsize_ptr2_cur;

          uint32_t vote = g.ballot(min_score_local <= min_score_global);
          if (vote) {
            int src_lane = __ffs(vote) - 1;
            int min_pos_evict = g.shfl(min_pos_local, src_lane);

            // Mark this position as visited for the winning thread.
            if (use_b1) {
              int visited_idx = g.shfl(min_local_idx, src_lane);
              if (rank == src_lane && visited_idx >= 0)
                b1_cached[visited_idx] = static_cast<S>(MAX_SCORE);
            } else {
              int visited_idx = g.shfl(min_local_idx, src_lane);
              if (rank == src_lane && visited_idx >= 0)
                b2_cached[visited_idx] = static_cast<S>(MAX_SCORE);
            }

            if (rank == i) {
              auto min_score_key = BUCKET::keys(evict_keys_ptr, min_pos_evict);
              auto expected_key =
                  min_score_key->load(cuda::std::memory_order_relaxed);
              if (expected_key != static_cast<K>(LOCKED_KEY) &&
                  expected_key != static_cast<K>(EMPTY_KEY)) {
                bool cas_ok = min_score_key->compare_exchange_strong(
                    expected_key, static_cast<K>(LOCKED_KEY),
                    cuda::std::memory_order_acquire,
                    cuda::std::memory_order_relaxed);
                if (cas_ok) {
                  S* score_ptr = BUCKET::scores(evict_keys_ptr, BUCKET_SIZE,
                                                min_pos_evict);
                  auto verify_score_ptr =
                      reinterpret_cast<AtomicScore<S>*>(score_ptr);
                  auto verify_score =
                      verify_score_ptr->load(cuda::std::memory_order_relaxed);
                  if (verify_score <= min_score_global) {
                    if (expected_key == static_cast<K>(RECLAIM_KEY)) {
                      occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
                      atomicAdd(evict_bsize_ptr, 1);
                    } else {
                      occupy_result = OccupyResult::EVICT;
                    }
                    key_pos = min_pos_evict;
                    target_bucket = use_b1 ? 1 : 2;
                    K* target_keys_ptr =
                        use_b1 ? bucket_keys_ptr1 : bucket_keys_ptr2;
                    D digest = get_dual_bucket_digest<K>(key);
                    ScoreFunctor_::update_with_digest(
                        target_keys_ptr, key_pos, sm_param_scores, tx,
                        score_cur, BUCKET_SIZE, digest, true);
                  } else {
                    min_score_key->store(expected_key,
                                         cuda::std::memory_order_release);
                  }
                }
              }
            }
          } else {
            // No thread holds the minimum — all positions exhausted.
            if (rank == i) {
              occupy_result = OccupyResult::REFUSED;
            }
            break;
          }
        }  // while eviction retry
      }
    }  // end of INITIAL check

    // === Step 3: Prefetch values to shared memory for previous key ===
    if (i > 0) {
      auto occupy_result_prev = g.shfl(occupy_result, i - 1);
      if (occupy_result_prev != OccupyResult::ILLEGAL &&
          occupy_result_prev != OccupyResult::REFUSED) {
        VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(i), dim);
        auto kv_idx_cur = g.shfl(kv_idx, i - 1);
        const VecV* src = values + kv_idx_cur * dim;
        CopyValue::ldg_sts(rank, dst, src, dim);
      }
    }
    __pipeline_commit();

    // === Step 4: Write values for key (i-2) ===
    if (i > 1) {
      auto occupy_result_wb = g.shfl(occupy_result, i - 2);
      if (occupy_result_wb != OccupyResult::ILLEGAL &&
          occupy_result_wb != OccupyResult::REFUSED) {
        VecV* src = SMM::values_buffer(smem, groupID, same_buf(i), dim);
        auto key_pos_wb = g.shfl(key_pos, i - 2);
        auto target_bucket_wb = g.shfl(target_bucket, i - 2);

        // Get the correct values pointer for the target bucket.
        VecV* dst;
        if (target_bucket_wb == 1) {
          VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
          dst = sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 2] +
                key_pos_wb * dim;
        } else {
          auto bv2 = g.shfl(bucket_values_ptr2, i - 2);
          dst = bv2 + key_pos_wb * dim;
        }
        __pipeline_wait_prior(3);
        CopyValue::lds_stg(rank, dst, src, dim);

        // Unlock key.
        if (rank == i - 2) {
          K* target_keys_ptr =
              (target_bucket == 1) ? bucket_keys_ptr1 : bucket_keys_ptr2;
          auto key_address = BUCKET::keys(target_keys_ptr, key_pos);
          key_address->store(key, cuda::std::memory_order_release);
        }
      }
    }
  }  // end main loop

  // =========== Pipeline draining ===========

  // Step 3 for last key (i = GROUP_SIZE - 1).
  {
    auto occupy_result_prev = g.shfl(occupy_result, GROUP_SIZE - 1);
    if (occupy_result_prev != OccupyResult::ILLEGAL &&
        occupy_result_prev != OccupyResult::REFUSED) {
      VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(GROUP_SIZE), dim);
      auto kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);
      const VecV* src = values + kv_idx_cur * dim;
      CopyValue::ldg_sts(rank, dst, src, dim);
    }
  }
  __pipeline_commit();

  // Step 4 for key (GROUP_SIZE - 2).
  {
    auto occupy_result_wb = g.shfl(occupy_result, GROUP_SIZE - 2);
    if (occupy_result_wb != OccupyResult::ILLEGAL &&
        occupy_result_wb != OccupyResult::REFUSED) {
      VecV* src = SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE), dim);
      auto key_pos_wb = g.shfl(key_pos, GROUP_SIZE - 2);
      auto target_bucket_wb = g.shfl(target_bucket, GROUP_SIZE - 2);
      VecV* dst;
      if (target_bucket_wb == 1) {
        VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
        dst = sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 2] +
              key_pos_wb * dim;
      } else {
        auto bv2 = g.shfl(bucket_values_ptr2, GROUP_SIZE - 2);
        dst = bv2 + key_pos_wb * dim;
      }
      __pipeline_wait_prior(1);
      CopyValue::lds_stg(rank, dst, src, dim);
      if (rank == GROUP_SIZE - 2) {
        K* target_keys_ptr =
            (target_bucket == 1) ? bucket_keys_ptr1 : bucket_keys_ptr2;
        auto key_address = BUCKET::keys(target_keys_ptr, key_pos);
        key_address->store(key, cuda::std::memory_order_release);
      }
    }
  }

  // Step 4 for last key (GROUP_SIZE - 1).
  {
    auto occupy_result_wb = g.shfl(occupy_result, GROUP_SIZE - 1);
    if (occupy_result_wb != OccupyResult::ILLEGAL &&
        occupy_result_wb != OccupyResult::REFUSED) {
      VecV* src =
          SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE + 1), dim);
      auto key_pos_wb = g.shfl(key_pos, GROUP_SIZE - 1);
      auto target_bucket_wb = g.shfl(target_bucket, GROUP_SIZE - 1);
      VecV* dst;
      if (target_bucket_wb == 1) {
        VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
        dst = sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1] +
              key_pos_wb * dim;
      } else {
        auto bv2 = g.shfl(bucket_values_ptr2, GROUP_SIZE - 1);
        dst = bv2 + key_pos_wb * dim;
      }
      __pipeline_wait_prior(0);
      CopyValue::lds_stg(rank, dst, src, dim);
      if (rank == GROUP_SIZE - 1) {
        K* target_keys_ptr =
            (target_bucket == 1) ? bucket_keys_ptr1 : bucket_keys_ptr2;
        auto key_address = BUCKET::keys(target_keys_ptr, key_pos);
        key_address->store(key, cuda::std::memory_order_release);
      }
    }
  }
}

// --- Kernel Launcher ---

// Host-side launcher for `dual_bucket_pipeline_upsert_kernel_with_io`.
//
// Launch configuration: 1D grid of ceil(n / 128) blocks with 128 threads
// each, tiles of 32 threads, bucket size fixed at 128. Dynamic shared memory
// is `SMM::total_size(dim)` rounded up to a multiple of sizeof(byte16).
//
// Side effect: `params.dim` is rewritten in place from "number of V elements"
// to "number of VecV elements" before the launch; callers see the converted
// value afterwards.
//
// NOTE(review): the shared-memory request grows with `dim`; no opt-in for
// large dynamic shared memory is performed here — confirm the value size is
// bounded by the caller.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_DualBucket_Pipeline_Upsert {
  using Params = Params_Upsert<K, V, S>;
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    constexpr uint32_t GROUP_SIZE = 32;
    constexpr uint32_t BUCKET_SIZE = 128;
    using SMM = SharedMemoryManager_Pipeline_Upsert<K, V, S, VecV, BLOCK_SIZE,
                                                    GROUP_SIZE, BUCKET_SIZE>;

    // Express the value width in VecV units (the kernel copies VecV elements).
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // An empty batch would request a grid of zero blocks, which is an invalid
    // launch configuration. There is nothing to upsert, so skip the launch.
    if (params.n == 0) return;

    uint32_t shared_mem = SMM::total_size(params.dim);
    // Round the request up to a whole number of 16-byte units.
    shared_mem =
        (shared_mem + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);
    dual_bucket_pipeline_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE,
                                               Strategy>
        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, shared_mem,
           stream>>>(params.buckets, params.buckets_size, params.buckets_num,
                     params.dim, params.keys,
                     reinterpret_cast<const VecV*>(params.values),
                     params.scores, params.n, params.global_epoch);
  }
};

// --- Kernel Selector ---

// Picks the vector type used to move values and forwards to the dual-bucket
// pipeline launcher.
//
// The byte size of one value row (`params.dim * sizeof(V)`) is tested against
// byte16, byte8, byte4 and byte2 in that order; the first type whose size
// divides the row size is used, and `byte` is the fallback. The pipeline
// kernel is used for every case.
template <typename K, typename V, typename S, int Strategy, typename ArchTag>
struct KernelSelector_DualBucketUpsert {
  using Params = Params_Upsert<K, V, S>;

  static void select_kernel(Params& params, cudaStream_t& stream) {
    const uint32_t row_bytes = static_cast<uint32_t>(params.dim * sizeof(V));

    if (row_bytes % sizeof(byte16) == 0) {
      Launch_DualBucket_Pipeline_Upsert<K, V, S, byte16,
                                        Strategy>::launch_kernel(params,
                                                                 stream);
      return;
    }
    if (row_bytes % sizeof(byte8) == 0) {
      Launch_DualBucket_Pipeline_Upsert<K, V, S, byte8,
                                        Strategy>::launch_kernel(params,
                                                                 stream);
      return;
    }
    if (row_bytes % sizeof(byte4) == 0) {
      Launch_DualBucket_Pipeline_Upsert<K, V, S, byte4,
                                        Strategy>::launch_kernel(params,
                                                                 stream);
      return;
    }
    if (row_bytes % sizeof(byte2) == 0) {
      Launch_DualBucket_Pipeline_Upsert<K, V, S, byte2,
                                        Strategy>::launch_kernel(params,
                                                                 stream);
      return;
    }
    // No wider type divides the row size: copy byte by byte.
    Launch_DualBucket_Pipeline_Upsert<K, V, S, byte, Strategy>::launch_kernel(
        params, stream);
  }
};

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/core_kernels/dual_bucket_utils.cuh
================================================
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

/**
 * Derives both candidate bucket indices for a key from its pre-computed hash.
 *
 *   bkt_idx1 = (low 32 bits of the hash)  mod buckets_num
 *   bkt_idx2 = (high 32 bits of the hash) mod buckets_num
 *
 * When the two collide, bkt_idx2 is moved to the next bucket (wrapping at
 * buckets_num), so the two indices differ whenever buckets_num >= 2. With
 * buckets_num == 1 both indices are necessarily 0; buckets_num must not be 0.
 *
 * This is the single source of truth for dual-bucket addressing.
 * All kernels (upsert, lookup, etc.) must use this function.
 *
 * @param hashed_key  Hash of the key (not the raw key).
 * @param buckets_num Number of buckets in the table.
 * @param bkt_idx1    [out] First candidate bucket.
 * @param bkt_idx2    [out] Second candidate bucket.
 */
template <class K>
__device__ __forceinline__ void get_dual_bucket_indices(
    const K hashed_key, const size_t buckets_num, size_t& bkt_idx1,
    size_t& bkt_idx2) {
  const uint32_t low_word = static_cast<uint32_t>(hashed_key);
  const uint64_t wide_hash = static_cast<uint64_t>(hashed_key);
  const uint32_t high_word = static_cast<uint32_t>(wide_hash >> 32);

  bkt_idx1 = low_word % buckets_num;
  bkt_idx2 = high_word % buckets_num;
  // Step to the neighbouring bucket when both halves map to the same one.
  bkt_idx2 = (bkt_idx2 != bkt_idx1) ? bkt_idx2 : (bkt_idx2 + 1) % buckets_num;
}

/**
 * Digest functions for dual-bucket mode.
 *
 * Dual-bucket digests use bits [56:63] (highest 8 bits) of the Murmur3 hash,
 * whereas single-bucket digests use bits [32:39].  The different bit range
 * avoids collision with the b2 bucket address, which is derived from the high
 * 32 bits (bits [32:63]).  Using [56:63] ensures that two keys mapping to the
 * same b2 bucket can still have distinct digests.
 *
 * INVARIANT: `dual_bucket_empty_digest()` must ALWAYS return the true
 * hash-derived value for EMPTY_KEY.  Kernels rely on this sentinel to
 * distinguish empty slots from occupied ones during the SIMD scan pass.
 * Returning a constant would cause every occupied slot to match the empty
 * digest, breaking the probing logic.
 */

// Dual-bucket digest of `key`: hashes the key with Murmur3HashDevice and keeps
// the top byte (bits [56:63]) of the result.
template <class K>
__device__ __forceinline__ D get_dual_bucket_digest(const K& key) {
  constexpr int kDigestShift = 56;
  const K key_hash = Murmur3HashDevice(key);
  const uint64_t hash_bits = static_cast<uint64_t>(key_hash);
  return static_cast<D>(hash_bits >> kDigestShift);
}

// Dual-bucket digest taken from an already computed hash: the top byte
// (bits [56:63]) of `hashed_key`. No hashing is performed here.
template <class K>
__device__ __forceinline__ D
get_dual_bucket_digest_from_hash(const K& hashed_key) {
  constexpr int kDigestShift = 56;
  const uint64_t hash_bits = static_cast<uint64_t>(hashed_key);
  return static_cast<D>(hash_bits >> kDigestShift);
}

// Replicates the dual-bucket digest of `hashed_key` into every byte of a
// VecD_Comp word so it can be compared against four stored digests at once
// with `__vcmpeq4`.
template <class K>
__device__ __forceinline__ VecD_Comp
dual_bucket_digests_from_hashed(const K& hashed_key) {
  const D top_byte = get_dual_bucket_digest_from_hash<K>(hashed_key);
  // Selector 0x0000 picks byte 0 of the first operand for all four lanes.
  const unsigned int replicated = __byte_perm(top_byte, top_byte, 0x0000);
  return static_cast<VecD_Comp>(replicated);
}

// Digest that marks an empty slot in dual-bucket mode. It is always derived
// from the real hash of EMPTY_KEY (top byte, bits [56:63]) and never replaced
// by a constant.
template <class K>
__device__ __forceinline__ D dual_bucket_empty_digest() {
  constexpr int kDigestShift = 56;
  const K empty_hash = Murmur3HashDevice(static_cast<K>(EMPTY_KEY));
  const uint64_t hash_bits = static_cast<uint64_t>(empty_hash);
  return static_cast<D>(hash_bits >> kDigestShift);
}

// Replicates the empty-slot digest into every byte of a VecD_Comp word for
// four-at-a-time SIMD comparison.
template <class K>
__device__ __forceinline__ VecD_Comp dual_bucket_empty_digests() {
  const D empty_byte = dual_bucket_empty_digest<K>();
  // Selector 0x0000 picks byte 0 of the first operand for all four lanes.
  const unsigned int replicated = __byte_perm(empty_byte, empty_byte, 0x0000);
  return static_cast<VecD_Comp>(replicated);
}

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/core_kernels/find_or_insert.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

// Find-or-insert where one thread handles one key-value pair end to end:
// probing, insertion or eviction, and the value copy.
//
// Launch layout: 1D grid / 1D block; thread `blockIdx.x * blockDim.x +
// threadIdx.x` owns `keys[kv_idx]`. Threads with `kv_idx >= n` or a reserved
// key return immediately. Static shared memory holds
// `sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S]`, indexed by `threadIdx.x`, so
// blockDim.x must not exceed BLOCK_SIZE.
//
// Per-key outcome:
//   - DUPLICATE: the key is already stored; its value is copied from the
//     bucket OUT to `values[kv_idx]`.
//   - OCCUPIED_EMPTY / OCCUPIED_RECLAIMED / EVICT: the key takes a slot and
//     `values[kv_idx]` is copied INTO the bucket. The bucket size counter is
//     incremented for the empty and reclaimed cases.
//   - REFUSED: the desired score is below the lowest evictable score in the
//     bucket; nothing is written.
// A claimed slot holds LOCKED_KEY while its score/value are written and is
// published with a release-store of the real key.
//
// `dim` is the value width in VecV elements.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void tlp_v1_find_or_insert_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, VecV* __restrict__ values,
    S* __restrict__ scores, uint64_t n, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, 1>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // bucket_capacity is a multiple of 4.
  // NOTE(review): positions are wrapped with `& (bucket_capacity - 1)` below,
  // which presumably requires a power-of-two capacity — confirm.
  constexpr uint32_t STRIDE_S = 4;
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  // Per-thread double buffer used to stream bucket scores during eviction.
  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);

    if (!IS_RESERVED_KEY<K>(key)) {
      // Locate the bucket and the starting probe position from the hash.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      // Snapshot of the bucket size; may be stale by the time it is used.
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      return;
    }
  } else {
    return;
  }

  // ---- Phase 1: probe for the key itself, then for an empty slot ----
  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep one bit per matching byte so __ffs can enumerate the matches.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // memory_order_acquire: later accesses to the bucket are not
        // reordered before this lock acquisition.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);
        break;
      } else if (bucket_size == bucket_capacity) {
        // Bucket was full when sampled: skip the empty-slot search.
        continue;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first (aligned-down) chunk, ignore slots before the start
        // position; they are revisited by the extra final iteration.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);

        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }

  // ---- Phase 2: bucket is full, evict the minimum-score entry ----
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
    // Prime the first half of the per-thread double buffer.
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      // Prefetch the next STRIDE_S scores into the other half.
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      // Everything except the batch just committed has landed.
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            // Only slots that can actually be taken are eviction candidates.
            // Without this check a LOCKED or EMPTY slot holding the lowest
            // score would be picked on every pass and the loop would spin.
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (score < min_score) {
      occupy_result = OccupyResult::REFUSED;
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        // Re-read the score under the lock: it may have been raised since the
        // scan, in which case the slot is released and the scan is repeated.
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                           kv_idx, score, bucket_capacity,
                                           get_digest<K>(key), true);

          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
          }

        } else {
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }

  // ---- Phase 3: copy the value and publish the key ----
  VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;
  // Widen before multiplying: `kv_idx * dim` can exceed 32 bits for large
  // batches.
  VecV* param_value_ptr = values + static_cast<size_t>(kv_idx) * dim;

  if (occupy_result != OccupyResult::REFUSED) {
    if (occupy_result == OccupyResult::DUPLICATE) {
      // Found: return the stored value to the caller.
      CopyValue::ldg_stg(0, param_value_ptr, bucket_value_ptr, dim);
    } else {
      // Inserted: store the caller's value in the bucket.
      CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);
    }
    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
    // memory_order_release: earlier writes to the bucket are not reordered
    // after this unlock.
    key_address->store(key, cuda::std::memory_order_release);
  }
}

// Find-or-insert where each thread probes/claims a slot for its own key, and
// the value copy is then done cooperatively: every tile of GROUP_SIZE threads
// walks over the keys of its lanes one by one and moves each value through a
// double buffer in shared memory.
//
// Launch layout: 1D grid / 1D block; thread `blockIdx.x * blockDim.x +
// threadIdx.x` owns `keys[kv_idx]`. Threads with `kv_idx >= n` or a reserved
// key are marked ILLEGAL but stay alive, because they still take part in the
// block barrier and the tile shuffles. Static shared memory holds
// `sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S]`, indexed by `threadIdx.x`
// (blockDim.x must not exceed BLOCK_SIZE); the same storage is reused as the
// value staging buffer after the barrier.
//
// Per-key outcome:
//   - DUPLICATE: the stored value is copied OUT to `values[kv_idx]`.
//   - OCCUPIED_EMPTY / OCCUPIED_RECLAIMED / EVICT: `values[kv_idx]` is copied
//     INTO the bucket.
//   - REFUSED / ILLEGAL: nothing is written.
// A claimed slot holds LOCKED_KEY until its value is written and is published
// with a release-store of the real key.
//
// `dim` is the value width in VecV elements; each half of a tile's staging
// buffer (GROUP_BUF elements) has to hold one value of `dim` elements.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,
          uint32_t GROUP_SIZE = 16, int Strategy = -1>
__global__ void tlp_v2_find_or_insert_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, VecV* __restrict__ values,
    S* __restrict__ scores, uint64_t n, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());

  // bucket_capacity is a multiple of 4.
  // NOTE(review): positions are wrapped with `& (bucket_capacity - 1)` below,
  // which presumably requires a power-of-two capacity — confirm.
  constexpr uint32_t STRIDE_S = 4;
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);

    if (!IS_RESERVED_KEY<K>(key)) {
      // Locate the bucket and the starting probe position from the hash.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      // Snapshot of the bucket size; may be stale by the time it is used.
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      occupy_result = OccupyResult::ILLEGAL;
    }
  } else {
    occupy_result = OccupyResult::ILLEGAL;
  }

  // ---- Phase 1: probe for the key itself, then for an empty slot ----
  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep one bit per matching byte so __ffs can enumerate the matches.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // memory_order_acquire: later accesses to the bucket are not
        // reordered before this lock acquisition.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);
        break;
      } else if (bucket_size == bucket_capacity) {
        // Bucket was full when sampled: skip the empty-slot search.
        continue;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first (aligned-down) chunk, ignore slots before the start
        // position; they are revisited by the extra final iteration.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }

  // ---- Phase 2: bucket is full, evict the minimum-score entry ----
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
    // Prime the first half of the per-thread double buffer.
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      // Prefetch the next STRIDE_S scores into the other half.
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      // Everything except the batch just committed has landed.
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            // Only slots that can actually be taken (neither locked nor
            // empty) are eviction candidates.
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }

    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (score < min_score) {
      occupy_result = OccupyResult::REFUSED;
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        // Re-read the score under the lock: it may have been raised since the
        // scan, in which case the slot is released and the scan is repeated.
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                           kv_idx, score, bucket_capacity,
                                           get_digest<K>(key), true);

          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
          }
        } else {
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }

  // ---- Phase 3: cooperative value copy ----
  VecV* bucket_value_ptr{nullptr};
  if ((occupy_result != OccupyResult::ILLEGAL) &&
      (occupy_result != OccupyResult::REFUSED)) {
    bucket_value_ptr = bucket_values_ptr + key_pos * dim;
  }
  // All threads are done with sm_bucket_scores as a score buffer; from here on
  // the same storage is reused for staging values.
  __syncthreads();
  uint32_t rank = g.thread_rank();
  uint32_t groupID = threadIdx.x / GROUP_SIZE;

  // Shared memory reuse:
  // __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];
  // __shared__ VecV sm_values_buffer[GROUP_NUM][2][GROUP_BUF];
  // Each of the two halves must hold one value: GROUP_BUF >= dim
  // (equivalently GROUP_BUFs >= 2 * dim).
  constexpr uint32_t GROUP_BUFs =
      GROUP_SIZE * 2 * STRIDE_S * sizeof(S) / sizeof(VecV);
  constexpr uint32_t GROUP_BUF = GROUP_BUFs / 2;
  auto sm_values_buffer =
      reinterpret_cast<VecV*>(&(sm_bucket_scores[0][0])) + groupID * GROUP_BUFs;

  // Prologue: stage the value for lane 0's key into the first half.
  auto occupy_result_next = g.shfl(occupy_result, 0);
  if ((occupy_result_next != OccupyResult::ILLEGAL) &&
      (occupy_result_next != OccupyResult::REFUSED)) {
    VecV* dst = sm_values_buffer;
    if (occupy_result_next == OccupyResult::DUPLICATE) {
      // Found: the source is the value stored in the bucket.
      const VecV* src = g.shfl(bucket_value_ptr, 0);
      CopyValue::ldg_sts(rank, dst, src, dim);
    } else {
      // Inserted: the source is the caller's value. Widen before multiplying
      // so `kv_idx * dim` cannot wrap at 32 bits.
      auto kv_idx_next = g.shfl(kv_idx, 0);
      const VecV* src = values + static_cast<size_t>(kv_idx_next) * dim;
      CopyValue::ldg_sts(rank, dst, src, dim);
    }
  }
  __pipeline_commit();

  for (int i = 0; i < GROUP_SIZE; i++) {
    // Prefetch the value for lane i+1 into the other half.
    if (i + 1 < GROUP_SIZE) {
      auto occupy_result_next = g.shfl(occupy_result, i + 1);
      if ((occupy_result_next != OccupyResult::ILLEGAL) &&
          (occupy_result_next != OccupyResult::REFUSED)) {
        VecV* dst = sm_values_buffer + diff_buf(i) * GROUP_BUF;
        if (occupy_result_next == OccupyResult::DUPLICATE) {
          const VecV* src = g.shfl(bucket_value_ptr, i + 1);
          CopyValue::ldg_sts(rank, dst, src, dim);
        } else {
          auto kv_idx_next = g.shfl(kv_idx, i + 1);
          const VecV* src = values + static_cast<size_t>(kv_idx_next) * dim;
          CopyValue::ldg_sts(rank, dst, src, dim);
        }
      }
    }
    // Committed unconditionally so the batch count stays in step with `i`.
    __pipeline_commit();

    // Drain the staged value for lane i to its destination.
    auto occupy_result_cur = g.shfl(occupy_result, i);
    if ((occupy_result_cur != OccupyResult::ILLEGAL) &&
        (occupy_result_cur != OccupyResult::REFUSED)) {
      VecV* src = sm_values_buffer + same_buf(i) * GROUP_BUF;
      // `__pipeline_wait_prior(1)` leaves only the most recently committed
      // batch (the prefetch for lane i+1) in flight, so the batch for lane i
      // is complete while the next load still overlaps with this store.
      if (occupy_result_cur == OccupyResult::DUPLICATE) {
        auto kv_idx_cur = g.shfl(kv_idx, i);
        VecV* dst = values + static_cast<size_t>(kv_idx_cur) * dim;
        __pipeline_wait_prior(1);
        CopyValue::lds_stg(rank, dst, src, dim);
      } else {
        VecV* dst = g.shfl(bucket_value_ptr, i);
        __pipeline_wait_prior(1);
        CopyValue::lds_stg(rank, dst, src, dim);
      }
    }
  }

  // ---- Phase 4: publish the key ----
  if ((occupy_result != OccupyResult::ILLEGAL) &&
      (occupy_result != OccupyResult::REFUSED)) {
    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
    // memory_order_release: earlier writes to the bucket are not reordered
    // after this unlock.
    key_address->store(key, cuda::std::memory_order_release);
  }
}

template <
    typename K, typename V, typename S, typename VecV, uint32_t BLOCK_SIZE,
    uint32_t GROUP_SIZE, uint32_t BUCKET_SIZE,
    uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE, uint32_t OFST_ParamScores = 0,
    uint32_t OFST_BucketValuesPtr = OFST_ParamScores + sizeof(S) * BLOCK_SIZE,
    uint32_t OFST_BucketsSizePtr =
        OFST_BucketValuesPtr + sizeof(VecV*) * BLOCK_SIZE,
    uint32_t OFST_BucketDigests =
        OFST_BucketsSizePtr + sizeof(int*) * BLOCK_SIZE,
    uint32_t OFST_BucketScores =
        OFST_BucketDigests + sizeof(D) * GROUP_NUM * 2 * BUCKET_SIZE,
    uint32_t OFST_BucketValues =
        OFST_BucketScores + sizeof(S) * GROUP_NUM * 2 * BUCKET_SIZE>
struct SharedMemoryManager_Pipeline_FindOrInsert {
  /*
    Carves one dynamic shared-memory allocation into six consecutive regions.
    The byte offsets of the regions are the OFST_* template parameters; the
    regions are equivalent to the following declarations, in this order:

      S     sm_param_scores[BLOCK_SIZE];
      VecV* sm_bucket_values_ptr[BLOCK_SIZE];
      int*  sm_buckets_size_ptr[BLOCK_SIZE];
      D     sm_bucket_digests[GROUP_NUM][2][BUCKET_SIZE];
      S     sm_bucket_scores[GROUP_NUM][2][BUCKET_SIZE];
      VecV  sm_values_buffer[GROUP_NUM][2][dim];

    The `[2]` dimension gives every group two slots per region, selected by
    the `buf` argument of the accessors below.
  */

  // Number of bytes covered by all six regions for a value made of `dim`
  // VecV elements. The sum is formed in size_t and then converted to the
  // uint32_t return type.
  static inline uint32_t total_size(uint32_t dim) {
    const size_t bytes_per_thread = sizeof(S) + sizeof(VecV*) + sizeof(int*);
    const size_t bytes_per_slot =
        BUCKET_SIZE * (sizeof(D) + sizeof(S)) + dim * sizeof(VecV);
    return BLOCK_SIZE * bytes_per_thread + GROUP_NUM * 2 * bytes_per_slot;
  }

  // Start of the per-thread score array.
  static __forceinline__ __device__ S* param_scores(byte* smem) {
    byte* region = smem + OFST_ParamScores;
    return reinterpret_cast<S*>(region);
  }

  // Start of the per-thread array of bucket value pointers.
  static __forceinline__ __device__ VecV** bucket_values_ptr(byte* smem) {
    byte* region = smem + OFST_BucketValuesPtr;
    return reinterpret_cast<VecV**>(region);
  }

  // Start of the per-thread array of bucket-size counter pointers.
  static __forceinline__ __device__ int** buckets_size_ptr(byte* smem) {
    byte* region = smem + OFST_BucketsSizePtr;
    return reinterpret_cast<int**>(region);
  }

  // BUCKET_SIZE digests belonging to slot `buf` of group `groupID`.
  static __forceinline__ __device__ D* bucket_digests(byte* smem,
                                                      uint32_t groupID,
                                                      uint32_t buf) {
    D* region = reinterpret_cast<D*>(smem + OFST_BucketDigests);
    return region + BUCKET_SIZE * (groupID * 2 + buf);
  }

  // BUCKET_SIZE scores belonging to slot `buf` of group `groupID`.
  static __forceinline__ __device__ S* bucket_scores(byte* smem,
                                                     uint32_t groupID,
                                                     uint32_t buf) {
    S* region = reinterpret_cast<S*>(smem + OFST_BucketScores);
    return region + BUCKET_SIZE * (groupID * 2 + buf);
  }

  // `dim` VecV elements (one value) belonging to slot `buf` of group
  // `groupID`.
  static __forceinline__ __device__ VecV* values_buffer(byte* smem,
                                                        uint32_t groupID,
                                                        uint32_t buf,
                                                        uint32_t dim) {
    VecV* region = reinterpret_cast<VecV*>(smem + OFST_BucketValues);
    return region + dim * (groupID * 2 + buf);
  }
};

/*
 * Pipelined find-or-insert kernel with value IO, written for buckets of
 * exactly 128 slots (BUCKET_SIZE is hard-coded below).
 *
 * Work mapping
 *   - One thread owns one key: thread `kv_idx` (global thread index) handles
 *     `keys[kv_idx]` when `kv_idx < n`.
 *   - Threads are grouped in tiles of GROUP_SIZE (32). A tile walks over its
 *     32 keys one by one; in iteration `i` all lanes cooperate on the key
 *     owned by lane `i`, whose state is broadcast with `g.shfl(..., i)`.
 *
 * Outcome per key (`occupy_result` of the owning lane)
 *   - ILLEGAL: key index out of range or reserved key; nothing is done.
 *   - DUPLICATE: the key is already in the bucket; the stored value is copied
 *     out of the bucket into `values`.
 *   - OCCUPIED_EMPTY / OCCUPIED_RECLAIMED / EVICT: a slot was taken; the value
 *     is copied from `values` into the bucket.
 *   - REFUSED: the key's score is lower than the lowest score found in the
 *     bucket; nothing is written.
 *   A slot is locked by swapping LOCKED_KEY into it and is published again by
 *   the release store of the real key in step 4.
 *
 * Parameters
 *   buckets, buckets_size  bucket array and its per-bucket size counters.
 *   buckets_num            number of buckets.
 *   dim                    length of one value, counted in VecV elements.
 *   keys, values, scores   per-key inputs; `scores` may be nullptr.
 *   n                      number of keys.
 *   global_epoch           forwarded to ScoreFunctor.
 *
 * Launch requirements
 *   - Per-thread shared-memory slots are indexed with threadIdx.x and sized
 *     by BLOCK_SIZE, so blockDim.x must not exceed BLOCK_SIZE
 *     (Launch_Pipeline_FindOrInsert launches exactly BLOCK_SIZE threads).
 *   - Dynamic shared memory laid out by
 *     SharedMemoryManager_Pipeline_FindOrInsert; at least
 *     SMM::total_size(dim) bytes must be supplied at launch.
 *   - Uses the `__pipeline_memcpy_async` / `__pipeline_commit` /
 *     `__pipeline_wait_prior` primitives.
 */
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void pipeline_find_or_insert_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, const uint32_t dim, const K* __restrict__ keys,
    VecV* __restrict__ values, S* __restrict__ scores, uint64_t n,
    const S global_epoch) {
  // Here, GROUP_SIZE * Comp_LEN = BUCKET_SIZE.
  constexpr uint32_t BUCKET_SIZE = 128;
  constexpr uint32_t GROUP_SIZE = 32;
  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);
  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);

  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using SMM =
      SharedMemoryManager_Pipeline_FindOrInsert<K, V, S, VecV, BLOCK_SIZE,
                                                GROUP_SIZE, BUCKET_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  __shared__ extern __align__(alignof(byte16)) byte smem[];

  // Initialization.
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  uint32_t tx = threadIdx.x;
  // 64-bit on purpose: both the global thread index and the element offset
  // `kv_idx * dim` into `values` can exceed 32 bits for large batches.
  uint64_t kv_idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  VecD_Comp target_digests;
  K* bucket_keys_ptr{nullptr};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  uint32_t key_pos = 0;
  if (kv_idx < n) {
    key = keys[kv_idx];
    if (scores != nullptr) {
      S* sm_param_scores = SMM::param_scores(smem);
      __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));
    }
    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * BUCKET_SIZE));
      uint64_t bkt_idx = global_idx / BUCKET_SIZE;
      key_pos = get_start_position(global_idx, BUCKET_SIZE);
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      sm_buckets_size_ptr[tx] = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
      __pipeline_memcpy_async(sm_bucket_values_ptr + tx, &(bucket->vectors),
                              sizeof(VecV*));
    } else {
      occupy_result = OccupyResult::ILLEGAL;
    }
  } else {
    occupy_result = OccupyResult::ILLEGAL;
  }

  uint32_t rank = g.thread_rank();
  uint32_t groupID = threadIdx.x / GROUP_SIZE;

  // Pipeline loading.
  // Prologue: start loading the digests of lane 0's bucket into slot 0.
  auto occupy_result_next = g.shfl(occupy_result, 0);
  auto keys_ptr_next = g.shfl(bucket_keys_ptr, 0);
  if (occupy_result_next == OccupyResult::INITIAL) {
    D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, 0);
    D* dst = sm_bucket_digests + rank * Load_LEN;
    D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);
    if (rank * Load_LEN < BUCKET_SIZE) {
      __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));
    }
  }
  __pipeline_commit();
  // Padding, meet the param of the first `__pipeline_wait_prior`
  // in the first loop.
  // Every loop iteration commits exactly three batches (digests of key i + 1,
  // scores of key i, values of key i - 1). With three batches committed before
  // the loop as well, each `__pipeline_wait_prior(3)` below waits for
  // everything except the three most recently committed batches.
  __pipeline_commit();
  __pipeline_commit();
  for (int32_t i = 0; i < GROUP_SIZE; i++) {
    // Step1: load digests from global memory to shared memory.
    if (i + 1 < GROUP_SIZE) {
      auto occupy_result_next = g.shfl(occupy_result, i + 1);
      auto keys_ptr_next = g.shfl(bucket_keys_ptr, i + 1);
      if (occupy_result_next == OccupyResult::INITIAL) {
        D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, diff_buf(i));
        D* dst = sm_bucket_digests + rank * Load_LEN;
        D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);
        if (rank * Load_LEN < BUCKET_SIZE) {
          __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));
        }
      }
    }
    __pipeline_commit();
    // Step2: to lock the target_key or empty_key by querying digests.
    auto occupy_result_cur = g.shfl(occupy_result, i);
    if (occupy_result_cur == OccupyResult::INITIAL) {
      uint32_t tx_cur = groupID * GROUP_SIZE + i;
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
      K key_cur = g.shfl(key, i);
      auto target_digests_cur = g.shfl(target_digests, i);
      auto start_pos_cur = g.shfl(key_pos, i);
      auto keys_ptr_cur = g.shfl(bucket_keys_ptr, i);
      auto bucket_size_cur = *bucket_size_ptr;
      // Digests of key i were committed three batches ago.
      __pipeline_wait_prior(3);
      D* src = SMM::bucket_digests(smem, groupID, same_buf(i));
      // Each lane compares Comp_LEN digests; the lanes together cover the
      // whole bucket, starting at the key's preferred position.
      uint32_t start_offset = start_pos_cur / Comp_LEN;
      uint32_t probe_offset =
          Comp_LEN * ((start_offset + rank) & (GROUP_SIZE - 1));
      VecD_Comp probe_digests =
          *reinterpret_cast<VecD_Comp*>(src + probe_offset);
      uint32_t cmp_result = __vcmpeq4(probe_digests, target_digests_cur);
      cmp_result &= 0x01010101;
      uint32_t possible_pos = 0;
      bool result = false;
      // Try to lock a slot whose digest matches and that holds the key.
      do {
        if (cmp_result == 0) break;
        int32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = probe_offset + index;
        auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);
        K expected_key = key_cur;
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      uint32_t found_vote = g.ballot(result);
      if (found_vote) {
        int32_t src_lane = __ffs(found_vote) - 1;
        possible_pos = g.shfl(possible_pos, src_lane);
        if (rank == i) {
          occupy_result = OccupyResult::DUPLICATE;
          key_pos = possible_pos;
          S* sm_param_scores = SMM::param_scores(smem);
          S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,
                                                      global_epoch);
          ScoreFunctor::update_with_digest(
              bucket_keys_ptr, key_pos, sm_param_scores, tx, score, BUCKET_SIZE,
              get_digest<K>(key), false);
        }
      } else if (bucket_size_cur < BUCKET_SIZE) {
        // Key not present and the bucket is not full: look for an empty slot.
        VecD_Comp empty_digests_ = empty_digests<K>();
        cmp_result = __vcmpeq4(probe_digests, empty_digests_);
        cmp_result &= 0x01010101;
        // Lanes try one after another so that at most one slot gets locked.
        for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {
          if (rank == offset) {
            do {
              if (cmp_result == 0) break;
              int32_t index = (__ffs(cmp_result) - 1) >> 3;
              cmp_result &= (cmp_result - 1);
              possible_pos = probe_offset + index;
              auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);
              K expected_key = static_cast<K>(EMPTY_KEY);
              result = current_key->compare_exchange_strong(
                  expected_key, static_cast<K>(LOCKED_KEY),
                  cuda::std::memory_order_acquire,
                  cuda::std::memory_order_relaxed);
            } while (!result);
          }
          uint32_t found_vote = g.ballot(result);
          if (found_vote) {
            int32_t src_lane = __ffs(found_vote) - 1;
            possible_pos = g.shfl(possible_pos, src_lane);
            if (rank == i) {
              occupy_result = OccupyResult::OCCUPIED_EMPTY;
              S* sm_param_scores = SMM::param_scores(smem);
              S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,
                                                          global_epoch);
              key_pos = possible_pos;
              ScoreFunctor::update_with_digest(
                  bucket_keys_ptr, key_pos, sm_param_scores, tx, score,
                  BUCKET_SIZE, get_digest<K>(key), true);
              atomicAdd(bucket_size_ptr, 1);
            }
            break;
          }
        }
      }
      occupy_result_cur = g.shfl(occupy_result, i);
      if (occupy_result_cur == OccupyResult::INITIAL) {
        // Still undecided: fetch the bucket's scores for the eviction search
        // performed in step 3 of the next iteration.
        S* sm_bucket_scores = SMM::bucket_scores(smem, groupID, same_buf(i));
        S* dst = sm_bucket_scores + rank * Load_LEN_S;
        S* src = BUCKET::scores(keys_ptr_cur, BUCKET_SIZE, rank * Load_LEN_S);
#pragma unroll
        for (int32_t k = 0; k < BUCKET_SIZE; k += GROUP_SIZE * Load_LEN_S) {
          __pipeline_memcpy_async(dst + k, src + k, sizeof(S) * Load_LEN_S);
        }
      }
    }
    __pipeline_commit();
    // Step 3: reduce to get the key with the minimum score.
    if (i > 0) {
      occupy_result_cur = g.shfl(occupy_result, i - 1);
      uint32_t tx_cur = groupID * GROUP_SIZE + i - 1;
      S* sm_param_scores = SMM::param_scores(smem);
      S score_cur = ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur,
                                                      global_epoch);

      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
      // Scores of key i - 1 were committed three batches ago.
      __pipeline_wait_prior(3);
      S* src = SMM::bucket_scores(smem, groupID, diff_buf(i));
      while (occupy_result_cur == OccupyResult::INITIAL) {
        int min_pos_local = -1;
        S min_score_local = static_cast<S>(MAX_SCORE);
#pragma unroll
        for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {
          S temp_scores[Load_LEN_S];
          *reinterpret_cast<byte16*>(temp_scores) =
              *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);
#pragma unroll
          for (int k = 0; k < Load_LEN_S; k++) {
            S temp_score = temp_scores[k];
            if (temp_score < min_score_local) {
              min_score_local = temp_score;
              min_pos_local = rank * Load_LEN_S + j + k;
            }
          }
        }
        const S min_score_global =
            cg::reduce(g, min_score_local, cg::less<S>());
        if (score_cur < min_score_global) {
          if (rank == i - 1) {
            occupy_result = OccupyResult::REFUSED;
          }
          occupy_result_cur = g.shfl(occupy_result, i - 1);
          break;
        }
        uint32_t vote = g.ballot(min_score_local <= min_score_global);
        if (vote) {
          int src_lane = __ffs(vote) - 1;
          int min_pos_global = g.shfl(min_pos_local, src_lane);
          if (rank == i - 1) {
            src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.
            auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);
            auto expected_key =
                min_score_key->load(cuda::std::memory_order_relaxed);
            if (expected_key != static_cast<K>(LOCKED_KEY) &&
                expected_key != static_cast<K>(EMPTY_KEY)) {
              bool result = min_score_key->compare_exchange_strong(
                  expected_key, static_cast<K>(LOCKED_KEY),
                  cuda::std::memory_order_acquire,
                  cuda::std::memory_order_relaxed);
              if (result) {
                // Re-read the score now that the slot is locked; if it is no
                // longer the minimum, restore the key and retry.
                S* score_ptr = BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE,
                                              min_pos_global);
                auto verify_score_ptr =
                    reinterpret_cast<AtomicScore<S>*>(score_ptr);
                auto verify_score =
                    verify_score_ptr->load(cuda::std::memory_order_relaxed);
                if (verify_score <= min_score_global) {
                  if (expected_key == static_cast<K>(RECLAIM_KEY)) {
                    occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
                    atomicAdd(bucket_size_ptr, 1);
                  } else {
                    occupy_result = OccupyResult::EVICT;
                  }
                  key_pos = min_pos_global;
                  ScoreFunctor::update_with_digest(
                      bucket_keys_ptr, key_pos, sm_param_scores, tx_cur,
                      score_cur, BUCKET_SIZE, get_digest<K>(key), true);
                } else {
                  min_score_key->store(expected_key,
                                       cuda::std::memory_order_release);
                }
              }
            }
          }
          occupy_result_cur = g.shfl(occupy_result, i - 1);
        }
      }
      // Prefetch values to shared memory.
      if (occupy_result_cur != OccupyResult::ILLEGAL &&
          occupy_result_cur != OccupyResult::REFUSED) {
        VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(i), dim);
        if (occupy_result_cur == OccupyResult::DUPLICATE) {
          // Key found: the source is the value stored in the bucket.
          VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
          auto bucket_values_ptr =
              sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 1];
          auto key_pos_cur = g.shfl(key_pos, i - 1);
          const VecV* src = bucket_values_ptr + key_pos_cur * dim;
          CopyValue::ldg_sts(rank, dst, src, dim);
        } else {
          // Key inserted: the source is the caller's value (64-bit offset).
          auto kv_idx_cur = g.shfl(kv_idx, i - 1);
          const VecV* src = values + kv_idx_cur * dim;
          CopyValue::ldg_sts(rank, dst, src, dim);
        }
      }
    }
    __pipeline_commit();

    // Step 4: write values to bucket or param buffer.
    if (i > 1) {
      occupy_result_cur = g.shfl(occupy_result, i - 2);
      if (occupy_result_cur != OccupyResult::ILLEGAL &&
          occupy_result_cur != OccupyResult::REFUSED) {
        VecV* src = SMM::values_buffer(smem, groupID, same_buf(i), dim);
        if (occupy_result_cur == OccupyResult::DUPLICATE) {
          uint64_t kv_idx_cur = g.shfl(kv_idx, i - 2);
          VecV* dst = values + kv_idx_cur * dim;
          // Values of key i - 2 were committed three batches ago.
          __pipeline_wait_prior(3);
          CopyValue::lds_stg(rank, dst, src, dim);
        } else {
          VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
          auto bucket_values_ptr =
              sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 2];
          auto key_pos_cur = g.shfl(key_pos, i - 2);
          VecV* dst = bucket_values_ptr + key_pos_cur * dim;
          __pipeline_wait_prior(3);
          CopyValue::lds_stg(rank, dst, src, dim);
        }
        if (rank == i - 2) {
          // Unlock the slot by publishing the real key.
          auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
          key_address->store(key, cuda::std::memory_order_release);
        }
      }
    }
  }
  // Epilogue: drain the pipeline. Step 3 for the last key, then step 4 for
  // the last two keys.
  auto occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
  uint32_t tx_cur = groupID * GROUP_SIZE + GROUP_SIZE - 1;
  S* sm_param_scores = SMM::param_scores(smem);
  S score_cur =
      ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur, global_epoch);

  int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
  auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
  // Only the value batch of key GROUP_SIZE - 2 may still be in flight.
  __pipeline_wait_prior(1);
  S* src = SMM::bucket_scores(smem, groupID, diff_buf(GROUP_SIZE));
  while (occupy_result_cur == OccupyResult::INITIAL) {
    int min_pos_local = -1;
    S min_score_local = static_cast<S>(MAX_SCORE);
#pragma unroll
    for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {
      S temp_scores[Load_LEN_S];
      *reinterpret_cast<byte16*>(temp_scores) =
          *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);
#pragma unroll
      for (int k = 0; k < Load_LEN_S; k++) {
        S temp_score = temp_scores[k];
        if (temp_score < min_score_local) {
          min_score_local = temp_score;
          min_pos_local = rank * Load_LEN_S + j + k;
        }
      }
    }
    const S min_score_global = cg::reduce(g, min_score_local, cg::less<S>());
    if (score_cur < min_score_global) {
      if (rank == GROUP_SIZE - 1) {
        occupy_result = OccupyResult::REFUSED;
      }
      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
      break;
    }
    uint32_t vote = g.ballot(min_score_local <= min_score_global);
    if (vote) {
      int src_lane = __ffs(vote) - 1;
      int min_pos_global = g.shfl(min_pos_local, src_lane);
      if (rank == GROUP_SIZE - 1) {
        src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.
        auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);
        auto expected_key =
            min_score_key->load(cuda::std::memory_order_relaxed);
        if (expected_key != static_cast<K>(LOCKED_KEY) &&
            expected_key != static_cast<K>(EMPTY_KEY)) {
          bool result = min_score_key->compare_exchange_strong(
              expected_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
          if (result) {
            S* score_ptr =
                BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);
            auto verify_score_ptr =
                reinterpret_cast<AtomicScore<S>*>(score_ptr);
            auto verify_score =
                verify_score_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_score <= min_score_global) {
              if (expected_key == static_cast<K>(RECLAIM_KEY)) {
                atomicAdd(bucket_size_ptr, 1);
                occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
              } else {
                occupy_result = OccupyResult::EVICT;
              }
              key_pos = min_pos_global;
              ScoreFunctor::update_with_digest(
                  bucket_keys_ptr, key_pos, sm_param_scores, tx_cur, score_cur,
                  BUCKET_SIZE, get_digest<K>(key), true);
            } else {
              min_score_key->store(expected_key,
                                   cuda::std::memory_order_release);
            }
          }
        }
      }
      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
    }
  }
  // Prefetch values to shared memory.
  if (occupy_result_cur != OccupyResult::ILLEGAL &&
      occupy_result_cur != OccupyResult::REFUSED) {
    VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(GROUP_SIZE), dim);
    if (occupy_result_cur == OccupyResult::DUPLICATE) {
      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
      auto bucket_values_ptr =
          sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1];
      auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);
      const VecV* src = bucket_values_ptr + key_pos_cur * dim;
      CopyValue::ldg_sts(rank, dst, src, dim);
    } else {
      auto kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);
      const VecV* src = values + kv_idx_cur * dim;
      CopyValue::ldg_sts(rank, dst, src, dim);
    }
  }
  __pipeline_commit();

  // Step 4: write values to bucket or param buffer.
  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 2);
  if (occupy_result_cur != OccupyResult::ILLEGAL &&
      occupy_result_cur != OccupyResult::REFUSED) {
    VecV* src = SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE), dim);
    if (occupy_result_cur == OccupyResult::DUPLICATE) {
      uint64_t kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 2);
      VecV* dst = values + kv_idx_cur * dim;
      // The batch just committed (values of the last key) may stay in flight.
      __pipeline_wait_prior(1);
      CopyValue::lds_stg(rank, dst, src, dim);
    } else {
      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
      auto bucket_values_ptr =
          sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 2];
      auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 2);
      VecV* dst = bucket_values_ptr + key_pos_cur * dim;
      __pipeline_wait_prior(1);
      CopyValue::lds_stg(rank, dst, src, dim);
    }
    if (rank == GROUP_SIZE - 2) {
      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
      key_address->store(key, cuda::std::memory_order_release);
    }
  }

  // Step 4: write values to bucket or param buffer.
  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
  if (occupy_result_cur != OccupyResult::ILLEGAL &&
      occupy_result_cur != OccupyResult::REFUSED) {
    VecV* src =
        SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE + 1), dim);
    if (occupy_result_cur == OccupyResult::DUPLICATE) {
      uint64_t kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);
      VecV* dst = values + kv_idx_cur * dim;
      // Wait for every outstanding asynchronous copy.
      __pipeline_wait_prior(0);
      CopyValue::lds_stg(rank, dst, src, dim);
    } else {
      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
      auto bucket_values_ptr =
          sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1];
      auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);
      VecV* dst = bucket_values_ptr + key_pos_cur * dim;
      __pipeline_wait_prior(0);
      CopyValue::lds_stg(rank, dst, src, dim);
    }
    if (rank == GROUP_SIZE - 1) {
      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
      key_address->store(key, cuda::std::memory_order_release);
    }
  }
}

/*
 * Argument bundle handed to the find-or-insert launchers and to the kernel
 * selector. It only stores what it is given; the launchers read the fields
 * and rewrite `dim` in place before launching.
 */
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct Params_FindOrInsert {
  Params_FindOrInsert(float load_factor_,
                      Bucket<K, V, S>* __restrict__ buckets_,
                      int* buckets_size_, size_t buckets_num_,
                      uint32_t bucket_capacity_, uint32_t dim_,
                      const K* __restrict__ keys_, V* __restrict__ values_,
                      S* __restrict__ scores_, size_t n_, const S global_epoch_)
      : load_factor{load_factor_},
        buckets{buckets_},
        buckets_size{buckets_size_},
        buckets_num{buckets_num_},
        bucket_capacity{bucket_capacity_},
        dim{dim_},
        keys{keys_},
        values{values_},
        scores{scores_},
        n{n_},
        global_epoch{global_epoch_} {}

  // Table load factor; the kernel selector compares it against thresholds.
  float load_factor;

  // Bucket array and the matching per-bucket size counters.
  Bucket<K, V, S>* __restrict__ buckets;
  int* buckets_size;
  // Number of buckets and the capacity of each one.
  size_t buckets_num;
  uint32_t bucket_capacity;

  // Length of one value. Supplied as a count of V elements; each launcher
  // overwrites it with the count of VecV elements before launching.
  uint32_t dim;

  // Per-key arrays of the request and the number of keys in it.
  const K* __restrict__ keys;
  V* __restrict__ values;
  S* __restrict__ scores;
  uint64_t n;

  // Passed through unchanged to the kernels.
  const S global_epoch;
};

/*
 * Launches `tlp_v1_find_or_insert_kernel_with_io` with one thread per key
 * and no dynamic shared memory.
 */
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_TLPv1_FindOrInsert {
  using Params = Params_FindOrInsert<K, V, S>;

  // Side effect: `params.dim` is converted in place from a count of V
  // elements to a count of VecV elements.
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;

    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // Enough blocks to give every key its own thread (rounded up).
    const auto grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    VecV* const vec_values = reinterpret_cast<VecV*>(params.values);

    tlp_v1_find_or_insert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>
        <<<grid_size, BLOCK_SIZE, 0, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.bucket_capacity, params.dim, params.keys, vec_values,
            params.scores, params.n, params.global_epoch);
  }
};

/*
 * Launches `tlp_v2_find_or_insert_kernel_with_io` with one thread per key
 * and no dynamic shared memory. The kernel's GROUP_SIZE is 8 for values of at
 * most 256 bytes and 16 for larger values.
 */
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_TLPv2_FindOrInsert {
  using Params = Params_FindOrInsert<K, V, S>;

  // Side effect: `params.dim` is converted in place from a count of V
  // elements to a count of VecV elements.
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    const uint32_t value_size = params.dim * sizeof(V);
    params.dim = value_size / sizeof(VecV);

    if (value_size > 256) {
      launch_with_group_size<16>(params, stream);
    } else {
      launch_with_group_size<8>(params, stream);
    }
  }

 private:
  // Issues the launch for one compile-time group size. Expects `params.dim`
  // to be already expressed in VecV elements.
  template <int GROUP_SIZE>
  inline static void launch_with_group_size(Params& params,
                                            cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    // Enough blocks to give every key its own thread (rounded up).
    const auto grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    VecV* const vec_values = reinterpret_cast<VecV*>(params.values);

    tlp_v2_find_or_insert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, GROUP_SIZE,
                                         Strategy>
        <<<grid_size, BLOCK_SIZE, 0, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.bucket_capacity, params.dim, params.keys, vec_values,
            params.scores, params.n, params.global_epoch);
  }
};

/*
 * Launches `pipeline_find_or_insert_kernel_with_io` with one thread per key
 * and the dynamic shared memory the kernel's layout manager asks for.
 */
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_Pipeline_FindOrInsert {
  using Params = Params_FindOrInsert<K, V, S>;

  // Side effect: `params.dim` is converted in place from a count of V
  // elements to a count of VecV elements.
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    constexpr uint32_t GROUP_SIZE = 32;
    constexpr uint32_t BUCKET_SIZE = 128;
    using SMM =
        SharedMemoryManager_Pipeline_FindOrInsert<K, V, S, VecV, BLOCK_SIZE,
                                                  GROUP_SIZE, BUCKET_SIZE>;

    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // Shared-memory request, rounded up to a whole number of byte16 units.
    const uint32_t required_bytes = SMM::total_size(params.dim);
    const size_t unit = sizeof(byte16);
    const uint32_t shared_mem = (required_bytes + unit - 1) / unit * unit;

    // Enough blocks to give every key its own thread (rounded up).
    const auto grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    VecV* const vec_values = reinterpret_cast<VecV*>(params.values);

    pipeline_find_or_insert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>
        <<<grid_size, BLOCK_SIZE, shared_mem, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.dim, params.keys, vec_values, params.scores, params.n,
            params.global_epoch);
  }
};

// Value-size thresholds, in bytes, used when choosing a find-or-insert
// kernel. The primary template is only declared; the thresholds come from
// the specialization for the given architecture tag.
template <typename ArchTag>
struct ValueConfig_FindOrInsert;

// Thresholds for the Sm80 architecture tag.
template <>
struct ValueConfig_FindOrInsert<Sm80> {
  // Values larger than this many bytes perform poorly with TLPv1.
  static constexpr uint32_t size_tlp_v1 = 16 * sizeof(byte4);
  // Upper value-size bound, in bytes, associated with TLPv2.
  static constexpr uint32_t size_tlp_v2 = 128 * sizeof(byte4);
};

// Thresholds for the Sm70 architecture tag.
template <>
struct ValueConfig_FindOrInsert<Sm70> {
  // Values larger than this many bytes perform poorly with TLPv1.
  static constexpr uint32_t size_tlp_v1 = 16 * sizeof(byte4);
  // Upper value-size bound, in bytes, associated with TLPv2.
  static constexpr uint32_t size_tlp_v2 = 128 * sizeof(byte4);
};

/*
 * Chooses and launches one of the find-or-insert kernels (TLPv1, TLPv2 or
 * Pipeline) from the bucket capacity, the value size in bytes and the load
 * factor. TLPv2 is only ever launched when CUDART_VERSION >= 11030.
 */
template <typename K, typename V, typename S, int Strategy, typename ArchTag>
struct KernelSelector_FindOrInsert {
  using ValueConfig = ValueConfig_FindOrInsert<ArchTag>;
  using Params = Params_FindOrInsert<K, V, S>;

  // Returns true when these kernels may be used: keys must be unique, the
  // bucket must hold at least one VecD_Load worth of digests, and the value
  // size in bytes must not exceed the configured limit.
  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {
    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);
    if (!unique_key) return false;
    if (bucket_size < MinBucketCap) return false;

#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
    constexpr uint32_t max_value_size = ValueConfig::size_tlp_v2;
#else
    constexpr uint32_t max_value_size = ValueConfig::size_tlp_v1;
#endif
    const uint32_t value_size = dim * sizeof(V);
    return value_size <= max_value_size;
  }

  // Launches the selected kernel on `stream`. The launcher that runs rewrites
  // `params.dim` in place.
  static void select_kernel(Params& params, cudaStream_t& stream) {
    const uint32_t total_value_size =
        static_cast<uint32_t>(params.dim * sizeof(V));
    const bool fits_tlp_v1 = total_value_size <= ValueConfig::size_tlp_v1;

    // This part is according to the test on A100.
    if (params.bucket_capacity != 128) {
      // The Pipeline kernel is never chosen for other bucket capacities.
      if (fits_tlp_v1) {
        launch_with_widest_vec<Launch_TLPv1_FindOrInsert>(total_value_size,
                                                          params, stream);
      } else {
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
        launch_with_widest_vec<Launch_TLPv2_FindOrInsert>(total_value_size,
                                                          params, stream);
#else
        launch_with_widest_vec<Launch_TLPv1_FindOrInsert>(total_value_size,
                                                          params, stream);
#endif
      }
      return;
    }

    if (fits_tlp_v1) {
      if (params.load_factor <= 0.98f) {
        launch_with_widest_vec<Launch_TLPv1_FindOrInsert>(total_value_size,
                                                          params, stream);
      } else {
        launch_with_widest_vec<Launch_Pipeline_FindOrInsert>(total_value_size,
                                                             params, stream);
      }
      return;
    }

#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
    if (params.load_factor <= 0.95f) {
      launch_with_widest_vec<Launch_TLPv2_FindOrInsert>(total_value_size,
                                                        params, stream);
      return;
    }
#endif
    launch_with_widest_vec<Launch_Pipeline_FindOrInsert>(total_value_size,
                                                         params, stream);
  }  // End function

 private:
  // Runs `Launcher<K, V, S, VecV, Strategy>::launch_kernel` with the first
  // of byte16, byte8, byte4, byte2, byte whose size divides
  // `total_value_size` evenly.
  template <template <typename, typename, typename, typename, int>
            class Launcher>
  static void launch_with_widest_vec(uint32_t total_value_size, Params& params,
                                     cudaStream_t& stream) {
    if (total_value_size % sizeof(byte16) == 0) {
      Launcher<K, V, S, byte16, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte8) == 0) {
      Launcher<K, V, S, byte8, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte4) == 0) {
      Launcher<K, V, S, byte4, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte2) == 0) {
      Launcher<K, V, S, byte2, Strategy>::launch_kernel(params, stream);
    } else {
      Launcher<K, V, S, byte, Strategy>::launch_kernel(params, stream);
    }
  }
};

/*
 * find or insert with IO operation. This kernel is
 * usually used for the pure HBM mode for better performance.
 *
 * A cooperative tile of `TILE_SIZE` threads handles one key
 * (`key_idx = t / TILE_SIZE`):
 *   1. Reserved keys are skipped.
 *   2. The tile locates the key, or locks a slot for it, in its bucket. The
 *      "vacant" or "full" search helper is chosen from a snapshot of the
 *      bucket size taken once before the retry loop.
 *   3. If the insertion is refused, nothing is written for this key.
 *   4. On `OccupyResult::DUPLICATE` the vector stored in the bucket is copied
 *      into `values`; otherwise the caller's vector in `values` is copied
 *      into the bucket slot.
 *   5. The lane `src_lane` updates the score, writes the digest and finally
 *      stores the key into the slot with `ScoreFunctor::UNLOCK_MEM_ORDER`.
 *
 * `table`            Only `table->buckets_size` is read from it.
 * `buckets`          Bucket array handed to `get_key_position`.
 * `bucket_max_size`  Capacity of one bucket.
 * `buckets_num`      Number of buckets.
 * `dim`              Number of `V` elements per value vector.
 * `keys`             Input keys, indexed by `key_idx`.
 * `values`           Value vectors, `dim` elements per key; read on insert,
 *                    written on a hit.
 * `scores`           Forwarded to `ScoreFunctor`.
 * `global_epoch`     Forwarded to `ScoreFunctor::desired_when_missed`.
 * `N`                Total number of work items, i.e. number of keys times
 *                    `TILE_SIZE`.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void find_or_insert_kernel_with_io(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, V* __restrict values, S* __restrict scores,
    const S global_epoch, const size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // Widen before multiplying: blockIdx/blockDim/gridDim components are 32-bit
  // unsigned, so the products would otherwise wrap before being converted to
  // size_t on very large launches.
  const size_t first_item =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t item_stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = first_item; t < N; t += item_stride) {
    int key_pos = -1;
    const size_t key_idx = t / TILE_SIZE;

    const K find_or_insert_key = keys[key_idx];

    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;

    const S find_or_insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
    V* find_or_insert_value = values + key_idx * dim;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    Bucket<K, V, S>* bucket =
        get_key_position<K>(buckets, find_or_insert_key, bkt_idx, start_idx,
                            buckets_num, bucket_max_size);

    OccupyResult occupy_result{OccupyResult::INITIAL};
    // Snapshot of the bucket occupancy; it is not re-read while retrying.
    const int bucket_size = buckets_size[bkt_idx];
    do {
      if (bucket_size < bucket_max_size) {
        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(
            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,
            start_idx, key_pos, src_lane, bucket_max_size);
      } else {
        // Round the start position down to a multiple of the tile width.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,
                                                ScoreFunctor::LOCK_MEM_ORDER,
                                                ScoreFunctor::UNLOCK_MEM_ORDER>(
            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,
            start_idx, key_pos, src_lane, bucket_max_size);
      }

      // Make every lane of the tile agree on the result held by `src_lane`.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    if (occupy_result == OccupyResult::REFUSED) continue;

    // A previously unused (empty or reclaimed) slot was taken: the bucket
    // grew by one entry. Only one lane performs the increment.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    if (occupy_result == OccupyResult::DUPLICATE) {
      // Hit: table slot -> caller buffer.
      copy_vector<V, TILE_SIZE>(g, bucket->vectors + key_pos * dim,
                                find_or_insert_value, dim);
    } else {
      // Miss: caller buffer -> table slot.
      copy_vector<V, TILE_SIZE>(g, find_or_insert_value,
                                bucket->vectors + key_pos * dim, dim);
    }
    if (g.thread_rank() == src_lane) {
      ScoreFunctor::update(bucket, key_pos, scores, key_idx,
                           find_or_insert_score,
                           (occupy_result != OccupyResult::DUPLICATE));
    }

    if (g.thread_rank() == src_lane) {
      // Write the digest first, then store the real key last with the
      // unlock memory order.
      bucket->digests(key_pos)[0] = get_digest<K>(find_or_insert_key);
      (bucket->keys(key_pos))
          ->store(find_or_insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
    }
  }
}

/*
 * Host-side dispatcher for `find_or_insert_kernel_with_io`.
 *
 * The cooperative tile width is chosen from the load factor: 4 threads per
 * key while `load_factor <= 0.75`, 32 threads per key otherwise. The kernel
 * is then launched asynchronously on `stream`; no error checking or
 * synchronization is done here.
 */
template <typename K, typename V, typename S, int Strategy>
struct SelectFindOrInsertKernelWithIO {
 private:
  // Launches the kernel instantiation that uses `TileSize` threads per key.
  template <unsigned int TileSize>
  static void launch_with_tile(const int& block_size,
                               const size_t bucket_max_size,
                               const size_t buckets_num, const size_t dim,
                               cudaStream_t& stream, const size_t& n,
                               const Table<K, V, S>* __restrict table,
                               Bucket<K, V, S>* buckets,
                               const K* __restrict keys, V* __restrict values,
                               S* __restrict scores, const S global_epoch) {
    // Each key is served by TileSize threads, so the kernel iterates over
    // n * TileSize work items.
    const size_t work_items = n * TileSize;
    const size_t blocks = SAFE_GET_GRID_SIZE(work_items, block_size);
    find_or_insert_kernel_with_io<K, V, S, Strategy, TileSize>
        <<<blocks, block_size, 0, stream>>>(table, buckets, bucket_max_size,
                                            buckets_num, dim, keys, values,
                                            scores, global_epoch, work_items);
  }

 public:
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             V* __restrict values, S* __restrict scores,
                             const S global_epoch) {
    if (load_factor <= 0.75) {
      launch_with_tile<4>(block_size, bucket_max_size, buckets_num, dim,
                          stream, n, table, buckets, keys, values, scores,
                          global_epoch);
      return;
    }
    launch_with_tile<32>(block_size, bucket_max_size, buckets_num, dim, stream,
                         n, table, buckets, keys, values, scores,
                         global_epoch);
  }
};

// Use 1 thread to deal with a KV-pair.
//
// For the key at index `blockIdx.x * blockDim.x + threadIdx.x` this kernel
// finds the key in its bucket or claims a slot for it, and leaves that slot's
// key set to LOCKED_KEY. The real key is NOT written back here; the slot
// address is returned through `key_ptrs` (see
// `read_or_write_kernel_unlock_key`, which writes keys back through it).
//
// Steps per thread:
//   1. Probe the bucket digests starting at the key's start position, looking
//      for the key itself (-> DUPLICATE) or, if the bucket was not full when
//      its size was read, for an empty slot (-> OCCUPIED_EMPTY).
//   2. If neither was found, repeatedly scan the bucket scores for the
//      lowest-scored occupied slot and try to take it over (-> EVICT or
//      OCCUPIED_RECLAIMED); give up (-> REFUSED) when the key's own score is
//      not greater than that minimum.
//
// Parameters:
//   buckets          Bucket array.
//   buckets_size     Per-bucket entry counters; read once at entry and
//                    atomically incremented when an empty or reclaimed slot
//                    is taken.
//   buckets_num      Number of buckets.
//   bucket_capacity  Slots per bucket. Positions are wrapped with
//                    `& (bucket_capacity - 1)`, which assumes a power of two.
//   dim              Number of `V` elements per value vector.
//   keys             Input keys (n entries).
//   value_ptrs       Out: address of the slot's value vector, or nullptr when
//                    REFUSED. Not written for reserved keys.
//   scores           Forwarded to `ScoreFunctor`.
//   key_ptrs         Out: address of the locked slot key; nullptr for
//                    reserved keys and when REFUSED.
//   keys_index       Optional; when non-null, `keys_index[i] = i` is written
//                    for every in-range `i`.
//   founds           Out: true iff the key was already present. Not written
//                    for reserved keys or when REFUSED.
//   n                Number of keys.
//   global_epoch     Forwarded to `ScoreFunctor::desired_when_missed`.
//
// Launch requirement visible from the code: `sm_bucket_scores` has
// `BLOCK_SIZE` rows and is indexed by `threadIdx.x`, so `blockDim.x` must not
// exceed `BLOCK_SIZE`.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void find_or_insert_kernel_lock_key_hybrid(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, V** __restrict__ value_ptrs,
    S* __restrict__ scores, K** __restrict__ key_ptrs,
    int* __restrict keys_index, bool* __restrict__ founds, uint64_t n,
    const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // bucket_capacity is a multiple of 4.
  // Scores are scanned STRIDE_S at a time, in 16-byte pieces of Load_LEN_S
  // scores each.
  constexpr uint32_t STRIDE_S = 4;
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  // One private row per thread, holding two chunks of STRIDE_S scores
  // (presumably used as a double buffer via same_buf/diff_buf -- TODO
  // confirm against their definitions).
  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  V* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];

    // help to address the original key after sorting value pointers.
    if (keys_index) {
      keys_index[kv_idx] = kv_idx;
    }

    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);

    if (!IS_RESERVED_KEY<K>(key)) {
      // Hash the key once; the hash yields the digest pattern to search for,
      // the bucket index and the start position inside the bucket.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      // Snapshot of the bucket size; it is not re-read later in this kernel.
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);
    } else {
      // Reserved key: only key_ptrs is cleared; value_ptrs and founds are
      // left untouched for this index.
      key_ptrs[kv_idx] = nullptr;
      return;
    }
  } else {
    // Thread beyond the input range.
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  // (The first pass skips empty slots located before key_pos; the extra
  // iteration wraps around to the first chunk so those slots get considered.)
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    // Start of the current chunk, wrapped around the bucket with a mask.
    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    // Split the vector load into four words; each word is compared as four
    // digest bytes below.
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep a single bit per matching byte so set bits can be walked one
      // candidate at a time.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        // Bit position / 8 gives the index of the matching byte.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        // Clear the lowest set bit (this candidate is consumed).
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // Lock the slot if it really holds our key. The acquire ordering on
        // success keeps later accesses to the bucket from being reordered
        // before this instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        // The key already exists and its slot is now locked by this thread.
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);
        break;
      } else if (bucket_size == bucket_capacity) {
        // The bucket was full when its size was read: do not look for empty
        // slots, just keep searching for the key.
        continue;
      }
      // Same byte-wise scan, this time looking for empty-slot digests.
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first chunk, ignore slots before the key's start position;
        // `continue` re-evaluates the loop condition with result == false.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        // Claim the slot only if it is still empty.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);

        // A new entry was added to the bucket.
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }

  // Neither the key nor an empty slot was found: try to replace the entry
  // with the lowest score. Each iteration rescans the whole bucket.
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
    // Start the asynchronous copy of the first STRIDE_S scores into this
    // thread's shared-memory row.
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      // Issue the copy of the next chunk (if any) before consuming the
      // current one.
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      // Wait for everything except the batch committed just above, i.e. for
      // the chunk that is consumed in this iteration.
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        // Move 16 bytes of scores from shared memory into registers.
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            // Only slots whose key is neither locked nor empty can become
            // the eviction candidate.
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }

    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    // The new key does not outrank the weakest entry: refuse the insertion.
    // (If no candidate was found, min_score is still MAX_SCORE and this
    // branch is taken as long as score <= MAX_SCORE, so min_pos == -1 is
    // not used below.)
    if (score <= min_score) {
      occupy_result = OccupyResult::REFUSED;
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      // Try to lock the candidate slot.
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        // Re-read the score now that the slot is locked: it may have changed
        // since the scan.
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          // occupy_result is still INITIAL here, so the last argument
          // evaluates to true.
          ScoreFunctor::update_with_digest(
              bucket_keys_ptr, key_pos, scores, kv_idx, score, bucket_capacity,
              get_digest<K>(key), (occupy_result != OccupyResult::DUPLICATE));
          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            // The slot held a reclaim marker rather than a live key, so the
            // bucket gains an entry.
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
          }
        } else {
          // The score increased in the meantime: put the previous key back
          // (which unlocks the slot) and rescan.
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }

  // Always true at this point (out-of-range threads returned above).
  if (kv_idx < n) {
    if (occupy_result == OccupyResult::REFUSED) {
      value_ptrs[kv_idx] = nullptr;
      key_ptrs[kv_idx] = nullptr;
    } else {
      // Publish where the value lives and which slot key is still locked.
      value_ptrs[kv_idx] = bucket_values_ptr + key_pos * dim;
      founds[kv_idx] = occupy_result == OccupyResult::DUPLICATE;
      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
      key_ptrs[kv_idx] = reinterpret_cast<K*>(key_address);
    }
  }
}

/* Copy value vectors between the table and `param_values` and write the
 * original keys back through `key_ptrs`.
 *
 * One work item handles one `VecV` element of one vector
 * (`vec_index = t / dim`, `dim_index = t % dim`). Vectors whose table address
 * is nullptr are skipped entirely, including the key write-back.
 *
 * `table_value_addrs`: one table address per vector, indexed by `vec_index`.
 * `param_values`:      contiguous buffer of `dim` elements per key, indexed
 *                      by the real key index.
 * `mask`:              indexed by the real key index; true means read the
 *                      table value into `param_values`, false means write
 *                      `param_values` into the table.
 * `param_key_index`:   maps `vec_index` to the real key index; when nullptr
 *                      the identity mapping is used.
 * `key_ptrs`:          per real key index, address to store the key to, or
 *                      nullptr for none.
 * `keys`:              original keys, indexed by the real key index.
 * `dim`:               number of `VecV` elements per vector.
 * `N`:                 total number of work items (vectors * dim).
 *
 * NOTE(review): the key is written back by the `dim_index == 0` work item
 * with a plain store and without any synchronization against the work items
 * copying the other elements of the same vector -- confirm callers do not
 * rely on ordering between the two.
 */
template <class K, class V, class S, class VecV = byte16>
__global__ void read_or_write_kernel_unlock_key(
    VecV** __restrict table_value_addrs, VecV* __restrict param_values,
    const bool* mask, const int* __restrict param_key_index,
    K** __restrict__ key_ptrs, const K* __restrict__ keys, const size_t dim,
    const size_t N) {
  // Widen before multiplying: the built-in index components are 32-bit
  // unsigned, so the products would otherwise wrap before reaching size_t.
  const size_t first_item =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t item_stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = first_item; t < N; t += item_stride) {
    const size_t vec_index = t / dim;
    const size_t dim_index = t % dim;
    const size_t real_key_index =
        param_key_index != nullptr
            ? static_cast<size_t>(param_key_index[vec_index])
            : vec_index;

    K* key_ptr = key_ptrs[real_key_index];
    K key = keys[real_key_index];

    /// if found, read the value from table, otherwise write it
    VecV* table_value = table_value_addrs[vec_index];
    if (table_value == nullptr) continue;

    // unlock the key (once per vector).
    if (key_ptr && dim_index == 0) *key_ptr = key;

    VecV* param_value = param_values + real_key_index * dim;
    if (mask[real_key_index]) {
      /// find
      param_value[dim_index] = table_value[dim_index];
    } else {
      /// insert
      table_value[dim_index] = param_value[dim_index];
    }
  }
}

/* find or insert with the end-user specified score.
 *
 * A cooperative tile of `TILE_SIZE` threads handles one key
 * (`key_idx = t / TILE_SIZE`). The tile locates the key, or locks a slot for
 * it, in its bucket and reports the address of the slot's value vector; no
 * value data is copied by this kernel.
 *
 * `table`            Only `table->buckets_size` is read from it.
 * `buckets`          Bucket array handed to `get_key_position`.
 * `bucket_max_size`  Capacity of one bucket.
 * `buckets_num`      Number of buckets.
 * `dim`              Number of `V` elements per value vector.
 * `keys`             Input keys, indexed by `key_idx`.
 * `vectors`          Out: `vectors[key_idx]` receives the address of the
 *                    slot's value vector. Not written for reserved keys or
 *                    refused insertions.
 * `scores`           Forwarded to `ScoreFunctor`.
 * `found`            Optional out: set to true when the key already existed.
 *                    It is never written to false here.
 * `keys_index`       Out: `keys_index[key_idx] = key_idx` for every
 *                    non-reserved key. Dereferenced unconditionally.
 * `global_epoch`     Forwarded to `ScoreFunctor::desired_when_missed`.
 * `N`                Total number of work items, i.e. number of keys times
 *                    `TILE_SIZE`.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void find_or_insert_kernel(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, V** __restrict vectors, S* __restrict scores,
    bool* __restrict found, int* __restrict keys_index, const S global_epoch,
    const size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // Widen before multiplying: blockIdx/blockDim/gridDim components are 32-bit
  // unsigned, so the products would otherwise wrap before being converted to
  // size_t on very large launches.
  const size_t first_item =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t item_stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = first_item; t < N; t += item_stride) {
    int key_pos = -1;
    size_t key_idx = t / TILE_SIZE;

    const K find_or_insert_key = keys[key_idx];

    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;

    const S find_or_insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    Bucket<K, V, S>* bucket =
        get_key_position<K>(buckets, find_or_insert_key, bkt_idx, start_idx,
                            buckets_num, bucket_max_size);

    // One lane per tile records the identity index for this key.
    if (g.thread_rank() == 0) {
      *(keys_index + key_idx) = key_idx;
    }

    OccupyResult occupy_result{OccupyResult::INITIAL};
    // Snapshot of the bucket occupancy; it is not re-read while retrying.
    const int bucket_size = buckets_size[bkt_idx];
    do {
      if (bucket_size < bucket_max_size) {
        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(
            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,
            start_idx, key_pos, src_lane, bucket_max_size);
      } else {
        // Round the start position down to a multiple of the tile width.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,
                                                ScoreFunctor::LOCK_MEM_ORDER,
                                                ScoreFunctor::UNLOCK_MEM_ORDER>(
            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,
            start_idx, key_pos, src_lane, bucket_max_size);
      }

      // Make every lane of the tile agree on the result held by `src_lane`.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    if (occupy_result == OccupyResult::REFUSED) continue;

    // A previously unused (empty or reclaimed) slot was taken: the bucket
    // grew by one entry. Only one lane performs the increment.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    if (g.thread_rank() == src_lane) {
      // Report where the value vector of this key lives in the table.
      *(vectors + key_idx) = (bucket->vectors + key_pos * dim);
      ScoreFunctor::update(bucket, key_pos, scores, key_idx,
                           find_or_insert_score,
                           occupy_result != OccupyResult::DUPLICATE);
      if (occupy_result == OccupyResult::DUPLICATE) {
        if (found != nullptr) {
          *(found + key_idx) = true;
        }
      }
      // Write the digest first, then store the real key last with the
      // unlock memory order.
      bucket->digests(key_pos)[0] = get_digest<K>(find_or_insert_key);
      (bucket->keys(key_pos))
          ->store(find_or_insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
    }
  }
}

/* Read the data from the addresses in table_value_addrs into the
  corresponding position of param_values if mask[i] is true, otherwise write
  the data from param_values to the addresses in table_value_addrs.
  Usually called after the find_or_insert kernel.

  One work item handles one element of one vector
  (`vec_index = t / dim`, `dim_index = t % dim`). Vectors whose table address
  is nullptr are skipped.

  `table_value_addrs`: A pointer of pointer of V which should be on HBM,
        but each value (a pointer of V) could point to a
        memory on HBM or HMEM.
  `param_values`: A contiguous memory pointer with vectors
        which should be on HBM.
  `mask`: One for each vector in `param_values`. If true, read from
          table_value_addrs; if false, write table_value_addrs from
          param_values.
  `param_key_index`: N values from address of table_value_addrs are mapped to
        param_values according to param_key_index. When nullptr, the
        identity mapping is used.
  `dim`: the dim of value.
  `N`: The total number of elements to process (number of vectors * dim).
*/
template <class K, class V, class S>
__global__ void read_or_write_kernel(V** __restrict table_value_addrs,
                                     V* __restrict param_values,
                                     const bool* mask,
                                     const int* __restrict param_key_index,
                                     const size_t dim, const size_t N) {
  // Widen before multiplying: the built-in index components are 32-bit
  // unsigned, so the products would otherwise wrap before reaching size_t.
  const size_t first_item =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t item_stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = first_item; t < N; t += item_stride) {
    const size_t vec_index = t / dim;
    const size_t dim_index = t % dim;
    const size_t real_key_index =
        param_key_index != nullptr
            ? static_cast<size_t>(param_key_index[vec_index])
            : vec_index;

    /// if found, read the value from table, otherwise write it
    V* table_value = table_value_addrs[vec_index];
    if (table_value == nullptr) continue;

    V* param_value = param_values + real_key_index * dim;
    if (mask[real_key_index]) {
      /// find
      param_value[dim_index] = table_value[dim_index];
    } else {
      /// insert
      table_value[dim_index] = param_value[dim_index];
    }
  }
}

/* Host-side counterpart of read_or_write_kernel, usually called after the
 * find_or_insert kernel.
 *
 * For every i in [0, N) with a non-null table_value_addrs[i], let
 * k = offset[i] (or k = i when offset is nullptr, mirroring how
 * read_or_write_kernel treats a null index array):
 *   - if founds[k] is true, copy `dim` elements from table_value_addrs[i] to
 *     param_values + k * dim;
 *   - otherwise copy `dim` elements from param_values + k * dim to
 *     table_value_addrs[i].
 *
 * The range [0, N) is split into contiguous chunks, each handled by its own
 * std::thread; all threads are joined before returning. Values are copied
 * with memcpy.
 *
 * `table_value_addrs`: N table addresses; nullptr entries are skipped.
 * `param_values`:      contiguous buffer with `dim` elements per key.
 * `offset`:            N indices mapping i to the key index, or nullptr for
 *                      the identity mapping.
 * `founds`:            per key index, true to read from the table, false to
 *                      write to the table.
 * `dim`:               number of V elements per vector.
 * `N`:                 number of vectors; nothing is done when N <= 0.
 * `n_worker`:          maximum number of threads; values below 1 are treated
 *                      as 1.
 */
template <class V>
void read_or_write_by_cpu(V** __restrict table_value_addrs,
                          V* __restrict param_values,
                          const int* __restrict offset, const bool* founds,
                          size_t dim, int N, int n_worker = 16) {
  if (N <= 0) return;
  if (n_worker < 1) n_worker = 1;

  // Processes the contiguous chunk [handled_size, handled_size + trunk_size).
  auto functor = [founds, dim](V** __restrict table_value_addrs,
                               V* __restrict param_values,
                               const int* __restrict offset, int handled_size,
                               int trunk_size) -> void {
    const size_t bytes = sizeof(V) * dim;
    for (int i = handled_size; i < handled_size + trunk_size; i++) {
      V* table_value = table_value_addrs[i];
      if (table_value == nullptr) continue;

      // A null `offset` means vector i belongs to key i.
      const size_t key_index =
          static_cast<size_t>(offset != nullptr ? offset[i] : i);
      V* param_value = param_values + key_index * dim;
      if (founds[key_index]) {
        memcpy(param_value, table_value, bytes);
      } else {
        memcpy(table_value, param_value, bytes);
      }
    }
  };

  // Spread N items over the workers: the first (N % n_worker) chunks get one
  // extra item. With fewer items than workers, one thread per item is used.
  int32_t trunk_size_floor = N / n_worker;
  int32_t trunk_size_remain = N % n_worker;
  int32_t n_worker_used = trunk_size_floor == 0 ? trunk_size_remain : n_worker;

  std::vector<std::thread> thds;
  thds.reserve(static_cast<size_t>(n_worker_used));

  int handled_size = 0;
  for (int i = 0; i < n_worker_used; i++) {
    int32_t cur_trunk_size = trunk_size_floor;
    if (trunk_size_remain != 0) {
      cur_trunk_size += 1;
      trunk_size_remain--;
    }
    thds.emplace_back(functor, table_value_addrs, param_values, offset,
                      handled_size, cur_trunk_size);
    handled_size += cur_trunk_size;
  }

  for (auto& thd : thds) {
    thd.join();
  }
}

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/find_ptr_or_insert.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

// Use 1 thread to deal with a KV-pair.
//
// For the key at index `blockIdx.x * blockDim.x + threadIdx.x` this kernel
// finds the key in its bucket or claims a slot for it, and leaves that slot's
// key set to LOCKED_KEY. The real key is NOT written back here; the slot
// address is returned through `key_ptrs` (see
// `find_or_insert_ptr_kernel_unlock_key`, which stores keys through it).
//
// Steps per thread:
//   1. Probe the bucket digests starting at the key's start position, looking
//      for the key itself (-> DUPLICATE) or, if the bucket was not full when
//      its size was read, for an empty slot (-> OCCUPIED_EMPTY).
//   2. If neither was found, repeatedly scan the bucket scores for the
//      lowest-scored occupied slot and try to take it over (-> EVICT or
//      OCCUPIED_RECLAIMED); give up (-> REFUSED) when the key's own score is
//      not greater than that minimum.
//
// Parameters:
//   buckets          Bucket array.
//   buckets_size     Per-bucket entry counters; read once at entry and
//                    atomically incremented when an empty or reclaimed slot
//                    is taken.
//   buckets_num      Number of buckets.
//   bucket_capacity  Slots per bucket. Positions are wrapped with
//                    `& (bucket_capacity - 1)`, which assumes a power of two.
//   dim              Number of `V` elements per value vector.
//   keys             Input keys (n entries).
//   value_ptrs       Out: address of the slot's value vector, or nullptr when
//                    REFUSED. Not written for reserved keys.
//   scores           Forwarded to `ScoreFunctor`.
//   key_ptrs         Out: address of the locked slot key; nullptr for
//                    reserved keys and when REFUSED.
//   n                Number of keys.
//   founds           Out: true iff the key was already present, false for
//                    every other non-reserved key (including REFUSED). Not
//                    written for reserved keys.
//   global_epoch     Forwarded to `ScoreFunctor::desired_when_missed`.
//
// Launch requirement visible from the code: `sm_bucket_scores` has
// `BLOCK_SIZE` rows and is indexed by `threadIdx.x`, so `blockDim.x` must not
// exceed `BLOCK_SIZE`.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void find_or_insert_ptr_kernel_lock_key(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, V** __restrict__ value_ptrs,
    S* __restrict__ scores, K** __restrict__ key_ptrs, uint64_t n,
    bool* __restrict__ founds, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // bucket_capacity is a multiple of 4.
  // Scores are scanned STRIDE_S at a time, in 16-byte pieces of Load_LEN_S
  // scores each.
  constexpr uint32_t STRIDE_S = 4;
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  // One private row per thread, holding two chunks of STRIDE_S scores
  // (presumably used as a double buffer via same_buf/diff_buf -- TODO
  // confirm against their definitions).
  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  V* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);

    if (!IS_RESERVED_KEY<K>(key)) {
      // Hash the key once; the hash yields the digest pattern to search for,
      // the bucket index and the start position inside the bucket.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      // Snapshot of the bucket size; it is not re-read later in this kernel.
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);
    } else {
      // Reserved key: only key_ptrs is cleared; value_ptrs and founds are
      // left untouched for this index.
      key_ptrs[kv_idx] = nullptr;
      return;
    }
  } else {
    // Thread beyond the input range.
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  // (The first pass skips empty slots located before key_pos; the extra
  // iteration wraps around to the first chunk so those slots get considered.)
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    // Start of the current chunk, wrapped around the bucket with a mask.
    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    // Split the vector load into four words; each word is compared as four
    // digest bytes below.
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep a single bit per matching byte so set bits can be walked one
      // candidate at a time.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        // Bit position / 8 gives the index of the matching byte.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        // Clear the lowest set bit (this candidate is consumed).
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // Lock the slot if it really holds our key. The acquire ordering on
        // success keeps later accesses to the bucket from being reordered
        // before this instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        // The key already exists and its slot is now locked by this thread.
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);
        break;
      } else if (bucket_size == bucket_capacity) {
        // The bucket was full when its size was read: do not look for empty
        // slots, just keep searching for the key.
        continue;
      }
      // Same byte-wise scan, this time looking for empty-slot digests.
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first chunk, ignore slots before the key's start position;
        // `continue` re-evaluates the loop condition with result == false.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        // Claim the slot only if it is still empty.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);

        // A new entry was added to the bucket.
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }

  // Neither the key nor an empty slot was found: try to replace the entry
  // with the lowest score. Each iteration rescans the whole bucket.
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
    // Start the asynchronous copy of the first STRIDE_S scores into this
    // thread's shared-memory row.
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      // Issue the copy of the next chunk (if any) before consuming the
      // current one.
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      // Wait for everything except the batch committed just above, i.e. for
      // the chunk that is consumed in this iteration.
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        // Move 16 bytes of scores from shared memory into registers.
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            // Only slots whose key is neither locked nor empty can become
            // the eviction candidate.
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }

    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    // The new key does not outrank the weakest entry: refuse the insertion.
    // (If no candidate was found, min_score is still MAX_SCORE and this
    // branch is taken as long as score <= MAX_SCORE, so min_pos == -1 is
    // not used below.)
    if (score <= min_score) {
      occupy_result = OccupyResult::REFUSED;
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      // Try to lock the candidate slot.
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        // Re-read the score now that the slot is locked: it may have changed
        // since the scan.
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                           kv_idx, score, bucket_capacity,
                                           get_digest<K>(key), true);

          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            // The slot held a reclaim marker rather than a live key, so the
            // bucket gains an entry.
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
          }
        } else {
          // The score increased in the meantime: put the previous key back
          // (which unlocks the slot) and rescan.
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }

  // Always true at this point (out-of-range threads returned above).
  if (kv_idx < n) {
    if (occupy_result == OccupyResult::REFUSED) {
      value_ptrs[kv_idx] = nullptr;
      key_ptrs[kv_idx] = nullptr;
    } else {
      // Publish where the value lives and which slot key is still locked.
      value_ptrs[kv_idx] = bucket_values_ptr + key_pos * dim;
      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
      key_ptrs[kv_idx] = reinterpret_cast<K*>(key_address);
    }
    // Written for refused keys as well (as false).
    founds[kv_idx] = occupy_result == OccupyResult::DUPLICATE;
  }
}

/**
 * Writes each input key through its recorded slot pointer.
 *
 * For every index `i < n` with a non-null `key_ptrs[i]`, stores `keys[i]` at
 * that address. Null entries are left untouched.
 *
 * Launch layout: 1-D grid, one thread per index, at least `n` threads.
 *
 * NOTE(review): presumably this releases slots that an earlier kernel left
 * holding a lock sentinel by publishing the real key - confirm against the
 * caller.
 *
 * @param keys     Input keys, `n` elements.
 * @param key_ptrs Per-key destination address, or nullptr to skip.
 * @param n        Number of keys.
 */
template <typename K>
__global__ void find_or_insert_ptr_kernel_unlock_key(const K* __restrict__ keys,
                                                     K** __restrict__ key_ptrs,
                                                     uint64_t n) {
  // Build the index in 64 bits: `blockIdx.x * blockDim.x` is otherwise a
  // 32-bit product that wraps (and was then narrowed to a signed int) once
  // the launch covers more than 2^31 threads.
  const uint64_t kv_idx =
      static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (kv_idx >= n) return;

  K* const key_ptr = key_ptrs[kv_idx];
  if (key_ptr != nullptr) {
    *key_ptr = keys[kv_idx];
  }
}

/* Find or insert with the end-user specified score, returning value pointers.
 *
 * A tile of TILE_SIZE threads cooperates on one key (`key_idx = t /
 * TILE_SIZE`), so `N` is the total thread count, i.e. number of keys times
 * TILE_SIZE. The outer loop is a grid-stride loop over `t`.
 *
 * Per key:
 *   - reserved keys are skipped entirely; `vectors[key_idx]` and
 *     `found[key_idx]` are NOT written for them;
 *   - the tile locks a slot through `find_and_lock_when_vacant` or
 *     `find_and_lock_when_full`, chosen from the bucket size sampled once
 *     before the retry loop;
 *   - the lane `src_lane` then writes score, digest and key, and publishes
 *     the value pointer (nullptr when the insert was REFUSED);
 *   - `found[key_idx]` is true only when the key already existed (DUPLICATE).
 *
 * NOTE(review): `blockIdx.x * blockDim.x` and `blockDim.x * gridDim.x` are
 * 32-bit products; confirm launch sizes never exceed 2^32 threads.
 * NOTE(review): the tile-uniform `continue` assumes all lanes of a tile map
 * to the same `key_idx`, which presumably relies on blockDim.x being a
 * multiple of TILE_SIZE - confirm.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void find_ptr_or_insert_kernel(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, V** __restrict vectors, S* __restrict scores,
    bool* __restrict found, const S global_epoch, const size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_pos = -1;
    // All threads of a tile share the same key.
    size_t key_idx = t / TILE_SIZE;

    const K find_or_insert_key = keys[key_idx];

    // Reserved keys are never inserted; outputs for them stay untouched.
    if (IS_RESERVED_KEY<K>(find_or_insert_key)) continue;

    const S find_or_insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    Bucket<K, V, S>* bucket =
        get_key_position<K>(buckets, find_or_insert_key, bkt_idx, start_idx,
                            buckets_num, bucket_max_size);

    OccupyResult occupy_result{OccupyResult::INITIAL};
    // Sampled once; the retry loop below does not re-read it.
    const int bucket_size = buckets_size[bkt_idx];
    do {
      if (bucket_size < bucket_max_size) {
        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(
            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,
            start_idx, key_pos, src_lane, bucket_max_size);
      } else {
        // Round the probe start down to a TILE_SIZE boundary.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,
                                                ScoreFunctor::LOCK_MEM_ORDER,
                                                ScoreFunctor::UNLOCK_MEM_ORDER>(
            g, bucket, find_or_insert_key, find_or_insert_score, evicted_key,
            start_idx, key_pos, src_lane, bucket_max_size);
      }

      // Broadcast the deciding lane's result to the whole tile.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    // A previously empty or reclaimed slot was taken: the bucket grew by one.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    // Only the lane that owns the slot writes metadata and outputs.
    if (g.thread_rank() == src_lane) {
      if (occupy_result != OccupyResult::REFUSED) {
        ScoreFunctor::update(bucket, key_pos, scores, key_idx,
                             find_or_insert_score,
                             occupy_result != OccupyResult::DUPLICATE);
        bucket->digests(key_pos)[0] = get_digest<K>(find_or_insert_key);
        // Storing the real key is the last write to the slot.
        (bucket->keys(key_pos))
            ->store(find_or_insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
        *(vectors + key_idx) = (bucket->vectors + key_pos * dim);
      } else {
        *(vectors + key_idx) = nullptr;
      }
      *(found + key_idx) = occupy_result == OccupyResult::DUPLICATE;
    }
  }
}

/**
 * Host-side dispatcher for `find_ptr_or_insert_kernel`.
 *
 * Picks the cooperative tile width from the table load factor
 * (<= 0.5 -> 4, <= 0.875 -> 8, otherwise 32) and launches the kernel on
 * `stream` with `n * tile_width` logical threads.
 */
template <typename K, typename V, typename S, int Strategy>
struct SelectFindOrInsertPtrKernel {
 private:
  // Launches the kernel with `TileWidth` cooperating threads per key.
  template <unsigned int TileWidth>
  static void launch_with_tile(const int& block_size,
                               const size_t bucket_max_size,
                               const size_t buckets_num, const size_t dim,
                               cudaStream_t& stream, const size_t& n,
                               const Table<K, V, S>* __restrict table,
                               Bucket<K, V, S>* buckets,
                               const K* __restrict keys, V** __restrict values,
                               S* __restrict scores, bool* __restrict found,
                               const S global_epoch) {
    // One tile per key, hence TileWidth threads for each of the n keys.
    const size_t N = n * TileWidth;
    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    find_ptr_or_insert_kernel<K, V, S, Strategy, TileWidth>
        <<<grid_size, block_size, 0, stream>>>(
            table, buckets, bucket_max_size, buckets_num, dim, keys, values,
            scores, found, global_epoch, N);
  }

 public:
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             V** __restrict values, S* __restrict scores,
                             bool* __restrict found, const S global_epoch) {
    if (load_factor <= 0.5) {
      launch_with_tile<4>(block_size, bucket_max_size, buckets_num, dim, stream,
                          n, table, buckets, keys, values, scores, found,
                          global_epoch);
    } else if (load_factor <= 0.875) {
      launch_with_tile<8>(block_size, bucket_max_size, buckets_num, dim, stream,
                          n, table, buckets, keys, values, scores, found,
                          global_epoch);
    } else {
      launch_with_tile<32>(block_size, bucket_max_size, buckets_num, dim,
                           stream, n, table, buckets, keys, values, scores,
                           found, global_epoch);
    }
  }
};

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/group_lock_kernels.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include <cuda/atomic>
#include <cuda/std/semaphore>

namespace nv {
namespace merlin {
namespace group_lock {

/**
 * Constructs the three lock-state atomics in place: both counters start at 0
 * and the unique flag starts as false.
 *
 * Only thread 0 of block 0 does the construction; all other threads return
 * immediately.
 */
template <typename T>
__global__ void init_kernel(
    cuda::atomic<T, cuda::thread_scope_device>* update_count,
    cuda::atomic<T, cuda::thread_scope_device>* read_count,
    cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {
  using Counter = cuda::atomic<T, cuda::thread_scope_device>;
  using Flag = cuda::atomic<bool, cuda::thread_scope_device>;

  const bool is_first_thread = (blockIdx.x == 0) && (threadIdx.x == 0);
  if (!is_first_thread) return;

  // Placement-new into storage provided by the caller.
  new (update_count) Counter{0};
  new (read_count) Counter{0};
  new (unique_flag) Flag{false};
}

/**
 * Acquires the group lock in "read" mode.
 *
 * Protocol: wait until no updater is registered, register as a reader, then
 * re-check the updater count. If an updater slipped in, deregister and retry.
 *
 * The register-then-recheck step is a store-buffering handshake with
 * `lock_update_kernel`. With relaxed ordering both sides may observe the
 * other's counter as zero and enter together, so the increment and the
 * re-check use sequentially consistent ordering. The seq_cst load also gives
 * the acquire semantics expected from a lock operation.
 *
 * NOTE(review): every launched thread runs the protocol; presumably this is
 * launched with a single thread - confirm against the host wrapper.
 */
template <typename T>
__global__ void lock_read_kernel(
    cuda::atomic<T, cuda::thread_scope_device>* update_count,
    cuda::atomic<T, cuda::thread_scope_device>* read_count) {
  for (;;) {
    // Pure polling: relaxed is enough, the decisive check comes below.
    while (update_count->load(cuda::std::memory_order_relaxed)) {
    }
    // Announce this reader, then verify no updater raced in.
    read_count->fetch_add(1, cuda::std::memory_order_seq_cst);
    if (update_count->load(cuda::std::memory_order_seq_cst) == 0) {
      break;
    }
    // Lost the race: withdraw and start over.
    read_count->fetch_sub(1, cuda::std::memory_order_relaxed);
  }
}

/**
 * Releases one "read" hold on the group lock by decrementing `read_count`.
 *
 * The decrement uses release ordering so that work ordered before this
 * unlock is visible to whoever subsequently acquires the lock.
 */
template <typename T>
__global__ void unlock_read_kernel(
    cuda::atomic<T, cuda::thread_scope_device>* read_count) {
  read_count->fetch_sub(1, cuda::std::memory_order_release);
}

/**
 * Acquires the group lock in "update" mode.
 *
 * Protocol: wait until no reader is registered, register as an updater, then
 * re-check the reader count. If a reader slipped in, deregister and retry.
 *
 * The register-then-recheck step is a store-buffering handshake with
 * `lock_read_kernel`. With relaxed ordering both sides may observe the
 * other's counter as zero and enter together, so the increment and the
 * re-check use sequentially consistent ordering. The seq_cst load also gives
 * the acquire semantics expected from a lock operation.
 *
 * NOTE(review): every launched thread runs the protocol; presumably this is
 * launched with a single thread - confirm against the host wrapper.
 */
template <typename T>
__global__ void lock_update_kernel(
    cuda::atomic<T, cuda::thread_scope_device>* update_count,
    cuda::atomic<T, cuda::thread_scope_device>* read_count) {
  for (;;) {
    // Pure polling: relaxed is enough, the decisive check comes below.
    while (read_count->load(cuda::std::memory_order_relaxed)) {
    }
    // Announce this updater, then verify no reader raced in.
    update_count->fetch_add(1, cuda::std::memory_order_seq_cst);
    if (read_count->load(cuda::std::memory_order_seq_cst) == 0) {
      break;
    }
    // Lost the race: withdraw and start over.
    update_count->fetch_sub(1, cuda::std::memory_order_relaxed);
  }
}

/**
 * Releases one "update" hold on the group lock by decrementing
 * `update_count`.
 *
 * The decrement uses release ordering so that work ordered before this
 * unlock is visible to whoever subsequently acquires the lock.
 */
template <typename T>
__global__ void unlock_update_kernel(
    cuda::atomic<T, cuda::thread_scope_device>* update_count) {
  update_count->fetch_sub(1, cuda::std::memory_order_release);
}

/**
 * Acquires the group lock exclusively against both readers and updaters.
 *
 * Three phases:
 *   1. Take `unique_flag` (false -> true) so only one exclusive holder
 *      exists at a time.
 *   2. "Ban update": register as a reader once no updater is registered.
 *   3. "Ban read": once this thread is the only registered reader
 *      (`read_count == 1`), register as an updater as well.
 *
 * Phases 2 and 3 use the same register-then-recheck handshake as
 * `lock_read_kernel` / `lock_update_kernel`. That handshake is only sound
 * under sequentially consistent ordering, because relaxed ordering allows
 * both sides to observe the other's counter in its old state. The flag CAS
 * uses acquire ordering, as expected from a lock operation.
 *
 * NOTE(review): every launched thread runs the protocol; presumably this is
 * launched with a single thread - confirm against the host wrapper.
 */
template <typename T>
__global__ void lock_update_read_kernel(
    cuda::atomic<T, cuda::thread_scope_device>* update_count,
    cuda::atomic<T, cuda::thread_scope_device>* read_count,
    cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {
  /* Lock unique flag */
  bool expected = false;
  while (!unique_flag->compare_exchange_weak(expected, true,
                                             cuda::std::memory_order_acquire)) {
    // A failed CAS overwrites `expected` with the observed value; reset it.
    expected = false;
  }

  /* Ban update */
  for (;;) {
    while (update_count->load(cuda::std::memory_order_relaxed)) {
    }
    read_count->fetch_add(1, cuda::std::memory_order_seq_cst);
    if (update_count->load(cuda::std::memory_order_seq_cst) == 0) {
      break;
    }
    read_count->fetch_sub(1, cuda::std::memory_order_relaxed);
  }

  /* Ban read */
  for (;;) {
    // This thread's own registration from the previous phase accounts for 1.
    while (read_count->load(cuda::std::memory_order_relaxed) > 1) {
    }
    update_count->fetch_add(1, cuda::std::memory_order_seq_cst);
    if (read_count->load(cuda::std::memory_order_seq_cst) == 1) {
      break;
    }
    update_count->fetch_sub(1, cuda::std::memory_order_relaxed);
  }
}

/**
 * Releases the exclusive hold taken by `lock_update_read_kernel`: drops the
 * reader registration, drops the updater registration, then clears
 * `unique_flag`.
 *
 * All three writes use release ordering so that work ordered before this
 * unlock is visible to whoever subsequently acquires the lock.
 */
template <typename T>
__global__ void unlock_update_read_kernel(
    cuda::atomic<T, cuda::thread_scope_device>* update_count,
    cuda::atomic<T, cuda::thread_scope_device>* read_count,
    cuda::atomic<bool, cuda::thread_scope_device>* unique_flag) {
  read_count->fetch_sub(1, cuda::std::memory_order_release);
  update_count->fetch_sub(1, cuda::std::memory_order_release);
  unique_flag->store(false, cuda::std::memory_order_release);
}

/**
 * Copies the current value of `update_count` into `*counter`
 * (relaxed load, plain store).
 */
template <typename T>
__global__ void update_count_kernel(
    T* counter, cuda::atomic<T, cuda::thread_scope_device>* update_count) {
  const T snapshot = update_count->load(cuda::std::memory_order_relaxed);
  *counter = snapshot;
}

/**
 * Copies the current value of `read_count` into `*counter`
 * (relaxed load, plain store).
 */
template <typename T>
__global__ void read_count_kernel(
    T* counter, cuda::atomic<T, cuda::thread_scope_device>* read_count) {
  const T snapshot = read_count->load(cuda::std::memory_order_relaxed);
  *counter = snapshot;
}

}  // namespace group_lock
}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/kernel_utils.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cuda_pipeline.h>
#include <cuda/barrier>
#include <mutex>
#include <thread>
#include <vector>
#include "../types.cuh"
#include "../utils.cuh"

using namespace cooperative_groups;
namespace cg = cooperative_groups;

namespace nv {
namespace merlin {

// Vector type used when moving digests through memory (wide access).
using VecD_Load = byte16;
// Vector type used when comparing digests (four digests packed per value).
using VecD_Comp = byte4;

// Warp shuffle for pointers: the pointer is shuffled as a 64-bit integer and
// converted back to `T*` on the receiving lane.
template <typename T>
__forceinline__ __device__ T* __shfl_sync_ptr(uint32_t mask, T* var,
                                              int srcLane,
                                              int width = warpSize) {
  const uint64_t local_bits = reinterpret_cast<uint64_t>(var);
  const uint64_t remote_bits = __shfl_sync(mask, local_bits, srcLane, width);
  return reinterpret_cast<T*>(remote_bits);
}

// Double-buffer selection by parity.
// Returns 0 for even i and 1 for odd i.
__forceinline__ __device__ int same_buf(int i) {
  return i & 0x01;
}
// Returns the opposite buffer: 1 for even i and 0 for odd i.
__forceinline__ __device__ int diff_buf(int i) {
  const int parity = i & 0x01;
  return parity ^ 1;
}

/**
 * Digest functions for single-bucket mode.
 *
 * A digest is a 1-byte fingerprint (bits [32:39] of the Murmur3 hash) stored
 * alongside each key in the bucket.  During lookup, the warp first performs a
 * SIMD comparison (`__vcmpeq4`) of the target digest against all 128 stored
 * digests.  Only slots whose digest matches proceed to full 8-byte key
 * comparison.  With a random 8-bit digest, the expected false-positive rate
 * is 1/256 per occupied slot, reducing full-key comparisons from O(bucket_size)
 * to ~0.5 per lookup miss.
 *
 * NOTE: Some pipeline kernels (lookup.cuh, contains.cuh) compute the target
 * digest inline as `hashed_key >> 32` for performance, bypassing
 * `get_digest()`. Any change to the digest derivation must be reflected in
 * those locations too.
 */

// Digest of the EMPTY_KEY sentinel: hash the sentinel, shift right by 32 and
// narrow to the digest type D.
template <typename K>
__forceinline__ __device__ D empty_digest() {
  const K sentinel = static_cast<K>(EMPTY_KEY);
  const K sentinel_hash = Murmur3HashDevice(sentinel);
  return static_cast<D>(sentinel_hash >> 32);
}

// Digest of the RECLAIM_KEY sentinel: hash the sentinel, shift right by 32
// and narrow to the digest type D.
template <typename K>
__forceinline__ __device__ D reclaim_digest() {
  const K sentinel = static_cast<K>(RECLAIM_KEY);
  const K sentinel_hash = Murmur3HashDevice(sentinel);
  return static_cast<D>(sentinel_hash >> 32);
}

// Digest of an arbitrary key: Murmur3 hash shifted right by 32, narrowed to
// the digest type D.
template <typename K>
__forceinline__ __device__ D get_digest(const K& key) {
  const K key_hash = Murmur3HashDevice(key);
  const D fingerprint = static_cast<D>(key_hash >> 32);
  return fingerprint;
}

// Takes an already-hashed key, extracts its digest and replicates that byte
// into all four bytes of a VecD_Comp (selector 0x0000 picks byte 0 four
// times), ready for a SIMD `__vcmpeq4` comparison.
template <typename K>
__forceinline__ __device__ VecD_Comp digests_from_hashed(const K& hashed_key) {
  const D fingerprint = static_cast<D>(hashed_key >> 32);
  const auto replicated = __byte_perm(fingerprint, fingerprint, 0x0000);
  return static_cast<VecD_Comp>(replicated);
}

// Replicates the EMPTY_KEY digest into all four bytes of a VecD_Comp for
// SIMD comparison.
template <typename K>
__forceinline__ __device__ VecD_Comp empty_digests() {
  const D fingerprint = empty_digest<K>();
  const auto replicated = __byte_perm(fingerprint, fingerprint, 0x0000);
  return static_cast<VecD_Comp>(replicated);
}

// Clears the low bits of `pos` selected by `ALIGN_SIZE - 1`.
// NOTE(review): this rounds down to a multiple of ALIGN_SIZE only when
// ALIGN_SIZE is a power of two - presumably always the case; confirm.
template <uint32_t ALIGN_SIZE>
__forceinline__ __device__ uint32_t align_to(uint32_t& pos) {
  constexpr uint32_t LOW_BITS = ALIGN_SIZE - 1;
  constexpr uint32_t KEEP_MASK = ~LOW_BITS;
  return pos & KEEP_MASK;
}

// Copies one element from `src` to `dst`. Declared here so that the
// `byte` / `byte2` specializations below can precede the generic definition.
template <typename ElementType>
__forceinline__ __device__ void LDGSTS(ElementType* dst,
                                       const ElementType* src);

// 1-byte elements: plain synchronous copy instead of the pipeline primitive.
template <>
__forceinline__ __device__ void LDGSTS<byte>(byte* dst, const byte* src) {
  *dst = *src;
}

// 2-byte elements: plain synchronous copy instead of the pipeline primitive.
template <>
__forceinline__ __device__ void LDGSTS<byte2>(byte2* dst, const byte2* src) {
  *dst = *src;
}

// Generic case: asynchronous copy of one element via the pipeline primitive.
// Requires compute capability >= 8.0.
// NOTE(review): the CUDA pipeline-primitives documentation restricts the copy
// size to 4, 8 or 16 bytes, which is presumably why 1- and 2-byte elements
// have dedicated specializations - confirm.
template <typename ElementType>
__forceinline__ __device__ void LDGSTS(ElementType* dst,
                                       const ElementType* src) {
  __pipeline_memcpy_async(dst, src, sizeof(ElementType));
}

// No-op score-copy policy: same interface as CopyScoreByPassCache, but every
// operation does nothing. The base pointer is null and loads yield 0.
template <typename S, typename K, int BUCKET_SIZE = 128>
struct CopyScoreEmpty {
  // There is no score array to point to.
  __forceinline__ __device__ static S* get_base_ptr(K** keys_ptr, int offset) {
    return nullptr;
  }

  // Staging copy: intentionally empty.
  __forceinline__ __device__ static void ldg_sts(S* dst, const S* src) {
  }

  // Load: always reports a zero score.
  __forceinline__ __device__ static S lgs(const S* src) {
    return 0;
  }

  // Store: intentionally empty.
  __forceinline__ __device__ static void stg(S* dst, const S score_) {
  }
};

// Score-copy policy that actually moves scores. Final stores use `__stcs`
// (streaming store).
template <typename S, typename K, int BUCKET_SIZE = 128>
struct CopyScoreByPassCache {
  // Treats the memory right after BUCKET_SIZE keys of bucket `offset` as that
  // bucket's score array.
  __forceinline__ __device__ static S* get_base_ptr(K** keys_ptr, int offset) {
    K* const past_keys = keys_ptr[offset] + BUCKET_SIZE;
    return reinterpret_cast<S*>(past_keys);
  }

  // Stages one score from `src` into `dst` through LDGSTS.
  __forceinline__ __device__ static void ldg_sts(S* dst, const S* src) {
    LDGSTS<S>(dst, src);
  }

  // Reads back one score.
  __forceinline__ __device__ static S lgs(const S* src) {
    return *src;
  }

  // Writes one score with a streaming store.
  __forceinline__ __device__ static void stg(S* dst, const S score_) {
    __stcs(dst, score_);
  }
};

// Value-copy policy for vectors of at most one element per group member:
// thread `rank` copies element `rank` if it lies inside the vector.
// `dim` is counted in VecV elements.
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueOneGroup {
  // Stage copy through LDGSTS (bounds-checked).
  __forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
                                                 const VecV* src, int dim) {
    if (rank >= dim) return;
    LDGSTS<VecV>(dst + rank, src + rank);
  }

  // Plain load from `src`, streaming store to `dst` (bounds-checked).
  __forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
                                                 const VecV* src, int dim) {
    if (rank >= dim) return;
    const VecV element = src[rank];
    __stcs(dst + rank, element);
  }
};

// Value-copy policy for vectors of up to two elements per group member.
// Thread `rank` copies element `rank` unconditionally, then element
// `rank + GROUP_SIZE` if it lies inside the vector. `dim` is counted in VecV
// elements.
// NOTE(review): the first copy is not bounds-checked, so callers presumably
// guarantee dim >= GROUP_SIZE - confirm.
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueTwoGroup {
  // Stage copy through LDGSTS.
  __forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
                                                 const VecV* src,
                                                 const int dim) {
    const int first = rank;
    const int second = rank + GROUP_SIZE;
    LDGSTS<VecV>(dst + first, src + first);
    if (second < dim) {
      LDGSTS<VecV>(dst + second, src + second);
    }
  }

  // Plain load from `src`, streaming store to `dst`.
  __forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
                                                 const VecV* src,
                                                 const int dim) {
    const int first = rank;
    const int second = rank + GROUP_SIZE;
    const VecV head = src[first];
    __stcs(dst + first, head);
    if (second < dim) {
      const VecV tail = src[second];
      __stcs(dst + second, tail);
    }
  }
};

// Value-copy policy for vectors of any length: thread `rank` handles elements
// rank, rank + GROUP_SIZE, rank + 2 * GROUP_SIZE, ... below `dim`.
// `dim` is counted in VecV elements.
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueMultipleGroup {
  // Stage copy through LDGSTS.
  __forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
                                                 const VecV* src,
                                                 const int dim) {
    int idx = rank;
    while (idx < dim) {
      LDGSTS<VecV>(dst + idx, src + idx);
      idx += GROUP_SIZE;
    }
  }

  // Plain load from `src`, streaming store to `dst`.
  __forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
                                                 const VecV* src,
                                                 const int dim) {
    int idx = rank;
    while (idx < dim) {
      const VecV element = src[idx];
      __stcs(dst + idx, element);
      idx += GROUP_SIZE;
    }
  }

  // Streaming load from `src`, streaming store to `dst`.
  __forceinline__ __device__ static void ldg_stg(int rank, VecV* dst,
                                                 const VecV* src,
                                                 const int dim) {
    int idx = rank;
    while (idx < dim) {
      const VecV element = __ldcs(src + idx);
      __stcs(dst + idx, element);
      idx += GROUP_SIZE;
    }
  }
};

/**
 * Records one evicted entry in the output arrays.
 *
 * Writes `key` to `evicted_keys[evict_idx]` and, when `evicted_scores` is
 * non-null, `score` to `evicted_scores[evict_idx]`. Both writes use `__stcs`
 * (streaming store, evict-first cache hint).
 *
 * @param evicted_keys   Output array of evicted keys (must be non-null).
 * @param evicted_scores Output array of evicted scores, or nullptr to skip.
 * @param evict_idx      Destination index in both arrays.
 * @param key            Evicted key.
 * @param score          Score of the evicted key.
 */
template <typename K, typename S>
__forceinline__ __device__ void evict_key_score(K* evicted_keys,
                                                S* evicted_scores,
                                                const uint32_t evict_idx,
                                                const K& key, const S& score) {
  // Cache with evict_first strategy.
  __stcs(evicted_keys + evict_idx, key);
  if (evicted_scores != nullptr) {
    __stcs(evicted_scores + evict_idx, score);
  }
}

// Score policy selected by eviction strategy; only the specializations below
// are defined.
template <class K, class V, class S, int Strategy>
struct ScoreFunctor;

// Number of bits the epoch is shifted left by when packed into a score
// (see make_epoch).
constexpr int EPOCH_BITS = 32;

// Selects the upper 32 bits of a 64-bit score (the epoch part).
constexpr uint64_t EPOCH_BITS_MASK = UINT64_C(0xFFFFFFFF00000000);
// Selects the lower 32 bits of a 64-bit score.
constexpr uint64_t SCORE_BITS_MASK = UINT64_C(0xFFFFFFFF);
// Largest value the lower 32 bits can hold; used as the saturation value.
constexpr uint64_t SCORE_32BIT_MAX = UINT64_C(0xFFFFFFFF);
/* The granularity of timestamp in the lower 32 bits is 1.048576ms
 * (nanoseconds shifted right by 20 bits, i.e. divided by 2^20). */
static constexpr int RSHIFT_ON_NANO = 20;

// Moves `epoch` into the upper part of a score by shifting it left by
// EPOCH_BITS.
template <class S>
__forceinline__ __device__ S make_epoch(const S& epoch) {
  const S shifted = epoch << EPOCH_BITS;
  return shifted;
}

// Coarse timestamp for the lower 32 bits of a score: the device nanosecond
// clock shifted right by RSHIFT_ON_NANO, masked to 32 bits.
template <class S>
__forceinline__ __device__ S make_nano() {
  const auto coarse_ticks = device_nano<S>() >> RSHIFT_ON_NANO;
  return (SCORE_BITS_MASK & coarse_ticks);
}

// LRU score policy: a slot's score is a device nanosecond timestamp.
// User-supplied scores and the epoch are ignored.
template <class K, class V, class S>
struct ScoreFunctor<K, V, S, EvictStrategyInternal::kLru> {
  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =
      cuda::std::memory_order_relaxed;
  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =
      cuda::std::memory_order_relaxed;
  using BUCKET = Bucket<K, V, S>;

  // Score a key would receive if inserted now: the current timestamp.
  __forceinline__ __device__ static S desired_when_missed(
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    const S now = device_nano<S>();
    return now;
  }

  // Stores the precomputed score into slot `key_pos` (relaxed atomic store).
  __forceinline__ __device__ static void update(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& desired_score_when_missed, const bool new_insert) {
    bucket->scores(key_pos)->store(desired_score_when_missed,
                                   cuda::std::memory_order_relaxed);
  }

  // Raw-pointer variant that also writes the digest. The stored score is a
  // fresh timestamp, not `desired_score_when_missed`.
  __forceinline__ __device__ static void update_with_digest(
      K* __restrict bucket_key_ptr, const uint32_t& key_pos,
      const S* __restrict const input_scores, const uint32_t& key_idx,
      const S& desired_score_when_missed, const uint32_t& bucket_capacity,
      const D& digest, const bool new_insert) {
    S* const score_slot =
        BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);
    D* const digest_slot =
        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);
    // `__stcg`: cache in L2, bypass L1.
    __stcg(digest_slot, digest);
    __stcg(score_slot, device_nano<S>());
  }

  // Refreshes the timestamp of an existing key (bucket-object variant).
  __forceinline__ __device__ static void update_without_missed(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    const S now = device_nano<S>();
    bucket->scores(key_pos)->store(now, cuda::std::memory_order_relaxed);
  }

  // Refreshes the timestamp of an existing key (raw-pointer variant).
  __forceinline__ __device__ static void update_without_missed(
      K* bucket_keys_ptr, const uint32_t bucket_capacity,
      const uint32_t key_pos, const S* __restrict const input_scores,
      const int key_idx, const S& epoch) {
    S* const score_slot =
        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
    // `__stcg`: cache in L2, bypass L1.
    __stcg(score_slot, device_nano<S>());
  }
};

// LFU score policy: a slot's score accumulates the user-supplied per-key
// increments. A new insert stores the increment; a hit adds it to the stored
// score. When `input_scores` is null, scores are left untouched.
template <class K, class V, class S>
struct ScoreFunctor<K, V, S, EvictStrategyInternal::kLfu> {
  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =
      cuda::std::memory_order_acquire;
  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =
      cuda::std::memory_order_release;
  using BUCKET = Bucket<K, V, S>;

  // Score assumed for a key that is about to be inserted: MAX_SCORE.
  __forceinline__ __device__ static S desired_when_missed(
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    return static_cast<S>(MAX_SCORE);
  }

  // Bucket-object variant: store on insert, atomic add on hit.
  __forceinline__ __device__ static void update(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& desired_score_when_missed, const bool new_insert) {
    if (input_scores == nullptr) return;
    if (new_insert) {
      bucket->scores(key_pos)->store(input_scores[key_idx],
                                     cuda::std::memory_order_relaxed);
    } else {
      bucket->scores(key_pos)->fetch_add(input_scores[key_idx],
                                         cuda::std::memory_order_relaxed);
    }
    return;
  }

  // Raw-pointer variant that also writes the digest.
  __forceinline__ __device__ static void update_with_digest(
      K* __restrict bucket_key_ptr, const uint32_t& key_pos,
      const S* __restrict const input_scores, const uint32_t& key_idx,
      const S& desired_score_when_missed, const uint32_t& bucket_capacity,
      const D& digest, const bool new_insert) {
    // The digest belongs to the key, not to the score. Publish it even when
    // no scores were supplied, otherwise the slot keeps the digest of the
    // key that previously lived there.
    D* dst_digest_ptr =
        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);
    // Cache in L2 cache, bypass L1 Cache.
    __stcg(dst_digest_ptr, digest);

    if (input_scores == nullptr) return;

    S* dst_score_ptr = BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);
    if (new_insert) {
      __stcg(dst_score_ptr, input_scores[key_idx]);
    } else {
      // Non-atomic read-modify-write of the stored score.
      __stcg(dst_score_ptr, input_scores[key_idx] + *dst_score_ptr);
    }
    return;
  }

  // Hit on an existing key (bucket-object variant): atomic add.
  __forceinline__ __device__ static void update_without_missed(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    if (input_scores == nullptr) return;
    bucket->scores(key_pos)->fetch_add(input_scores[key_idx],
                                       cuda::std::memory_order_relaxed);
    return;
  }

  // Hit on an existing key (raw-pointer variant): non-atomic add.
  __forceinline__ __device__ static void update_without_missed(
      K* bucket_keys_ptr, const uint32_t bucket_capacity,
      const uint32_t key_pos, const S* __restrict const input_scores,
      const int key_idx, const S& epoch) {
    if (input_scores == nullptr) return;
    S* dst_score_ptr =
        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
    // Cache in L2 cache, bypass L1 Cache.
    __stcg(dst_score_ptr, input_scores[key_idx] + *dst_score_ptr);
  }
};

// Epoch-LRU score policy: score = (epoch << EPOCH_BITS) | coarse timestamp.
template <class K, class V, class S>
struct ScoreFunctor<K, V, S, EvictStrategyInternal::kEpochLru> {
  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =
      cuda::std::memory_order_relaxed;
  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =
      cuda::std::memory_order_relaxed;
  using BUCKET = Bucket<K, V, S>;

  // Score for a key about to be inserted. When the epoch equals
  // IGNORED_GLOBAL_EPOCH and scores were supplied, the supplied score is used
  // as is; otherwise epoch and timestamp are packed together.
  __forceinline__ __device__ static S desired_when_missed(
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    const bool use_input_score =
        epoch == static_cast<S>(IGNORED_GLOBAL_EPOCH) &&
        input_scores != nullptr;
    if (use_input_score) {
      return input_scores[key_idx];
    }
    return make_epoch<S>(epoch) | make_nano<S>();
  }

  // Stores the precomputed score into slot `key_pos` (relaxed atomic store).
  __forceinline__ __device__ static void update(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& desired_score_when_missed, const bool new_insert) {
    bucket->scores(key_pos)->store(desired_score_when_missed,
                                   cuda::std::memory_order_relaxed);
  }

  // Raw-pointer variant that writes both the digest and the precomputed
  // score.
  __forceinline__ __device__ static void update_with_digest(
      K* __restrict bucket_key_ptr, const uint32_t& key_pos,
      const S* __restrict const input_scores, const uint32_t& key_idx,
      const S& desired_score_when_missed, const uint32_t& bucket_capacity,
      const D& digest, const bool new_insert) {
    S* const score_slot =
        BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);
    D* const digest_slot =
        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);
    // `__stcg`: cache in L2, bypass L1.
    __stcg(digest_slot, digest);
    __stcg(score_slot, desired_score_when_missed);
  }

  // Hit on an existing key (bucket-object variant): re-stamp with the given
  // epoch and the current coarse timestamp.
  __forceinline__ __device__ static void update_without_missed(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    bucket->scores(key_pos)->store(make_epoch<S>(epoch) | make_nano<S>(),
                                   cuda::std::memory_order_relaxed);
  }

  // Hit on an existing key (raw-pointer variant).
  __forceinline__ __device__ static void update_without_missed(
      K* bucket_keys_ptr, const uint32_t bucket_capacity,
      const uint32_t key_pos, const S* __restrict const input_scores,
      const int key_idx, const S& epoch) {
    S* const score_slot =
        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
    // `__stcg`: cache in L2, bypass L1.
    __stcg(score_slot, make_epoch<S>(epoch) | make_nano<S>());
  }
};

// Epoch-LFU score policy: the upper 32 bits carry the epoch, the lower 32
// bits carry a frequency counter that saturates at SCORE_32BIT_MAX.
template <class K, class V, class S>
struct ScoreFunctor<K, V, S, EvictStrategyInternal::kEpochLfu> {
  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =
      cuda::std::memory_order_relaxed;
  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =
      cuda::std::memory_order_relaxed;
  using BUCKET = Bucket<K, V, S>;

  // Score for a key about to be inserted. With IGNORED_GLOBAL_EPOCH the
  // supplied score is used as is; otherwise the epoch is packed above the
  // low 32 bits of the supplied score. `input_scores` is dereferenced
  // without a null check.
  __forceinline__ __device__ static S desired_when_missed(
      const S* __restrict const input_scores, const int key_idx,
      const S epoch) {
    if (epoch == static_cast<S>(IGNORED_GLOBAL_EPOCH)) {
      return input_scores[key_idx];
    }
    return make_epoch<S>(epoch) | (input_scores[key_idx] & SCORE_BITS_MASK);
  }

  // Bucket-object variant. A new insert stores the desired score directly; a
  // hit adds it to the stored low 32 bits, saturating at SCORE_32BIT_MAX
  // while keeping the desired score's epoch bits.
  __forceinline__ __device__ static void update(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& desired_score_when_missed, const bool new_insert) {
    if (new_insert) {
      bucket->scores(key_pos)->store(desired_score_when_missed,
                                     cuda::std::memory_order_relaxed);
      return;
    }

    S merged =
        (bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed) &
         SCORE_BITS_MASK);
    if (SCORE_32BIT_MAX - merged >
        (desired_score_when_missed & SCORE_BITS_MASK)) {
      merged += desired_score_when_missed;
    } else {
      merged = (desired_score_when_missed & EPOCH_BITS_MASK) | SCORE_32BIT_MAX;
    }
    bucket->scores(key_pos)->store(merged, cuda::std::memory_order_relaxed);
  }

  // Raw-pointer variant that also writes the digest. Same merge rule as
  // `update`, using plain loads and `__stcg` stores.
  __forceinline__ __device__ static void update_with_digest(
      K* __restrict bucket_key_ptr, const uint32_t& key_pos,
      const S* __restrict const input_scores, const uint32_t& key_idx,
      const S& desired_score_when_missed, const uint32_t& bucket_capacity,
      const D& digest, const bool new_insert) {
    S* const score_slot =
        BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);
    D* const digest_slot =
        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);

    S merged = desired_score_when_missed;
    if (!new_insert) {
      merged = (*score_slot & SCORE_BITS_MASK);
      if (SCORE_32BIT_MAX - merged >
          (desired_score_when_missed & SCORE_BITS_MASK)) {
        merged += desired_score_when_missed;
      } else {
        merged =
            (desired_score_when_missed & EPOCH_BITS_MASK) | SCORE_32BIT_MAX;
      }
    }
    // `__stcg`: cache in L2, bypass L1.
    __stcg(digest_slot, digest);
    __stcg(score_slot, merged);
  }

  // Hit on an existing key (bucket-object variant): add the supplied
  // frequency to the stored low 32 bits with saturation and stamp the given
  // epoch. No-op when `input_scores` is null.
  __forceinline__ __device__ static void update_without_missed(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    if (input_scores == nullptr) return;

    S merged =
        (bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed) &
         SCORE_BITS_MASK);
    if (SCORE_32BIT_MAX - merged > (input_scores[key_idx] & SCORE_BITS_MASK)) {
      merged +=
          (make_epoch<S>(epoch) | (input_scores[key_idx] & SCORE_BITS_MASK));
    } else {
      merged = make_epoch<S>(epoch) | SCORE_32BIT_MAX;
    }
    bucket->scores(key_pos)->store(merged, cuda::std::memory_order_relaxed);
  }

  // Hit on an existing key (raw-pointer variant). No-op when `input_scores`
  // is null.
  __forceinline__ __device__ static void update_without_missed(
      K* bucket_keys_ptr, const uint32_t bucket_capacity,
      const uint32_t key_pos, const S* __restrict const input_scores,
      const int key_idx, const S& epoch) {
    if (input_scores == nullptr) return;

    S* const score_slot =
        BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
    S merged = *score_slot & SCORE_BITS_MASK;
    if (SCORE_32BIT_MAX - merged > (input_scores[key_idx] & SCORE_BITS_MASK)) {
      merged +=
          (make_epoch<S>(epoch) | (input_scores[key_idx] & SCORE_BITS_MASK));
    } else {
      merged = make_epoch<S>(epoch) | SCORE_32BIT_MAX;
    }
    // `__stcg`: cache in L2, bypass L1.
    __stcg(score_slot, merged);
  }
};

/**
 * ScoreFunctor specialization for EvictStrategyInternal::kCustomized.
 *
 * Every method simply forwards a caller-supplied score into the bucket; no
 * score is derived from the stored value or from the epoch argument.
 */
template <class K, class V, class S>
struct ScoreFunctor<K, V, S, EvictStrategyInternal::kCustomized> {
  using BUCKET = Bucket<K, V, S>;

  static constexpr cuda::std::memory_order LOCK_MEM_ORDER =
      cuda::std::memory_order_acquire;
  static constexpr cuda::std::memory_order UNLOCK_MEM_ORDER =
      cuda::std::memory_order_release;

  /**
   * Score to use for a key that is about to be inserted: the caller's score
   * at `key_idx`. `input_scores` is dereferenced unconditionally; `epoch` is
   * not used.
   */
  __forceinline__ __device__ static S desired_when_missed(
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    const S caller_score = input_scores[key_idx];
    return caller_score;
  }

  /**
   * Stores `desired_score_when_missed` into slot `key_pos` with a relaxed
   * atomic store. `input_scores`, `key_idx` and `new_insert` are not used.
   */
  __forceinline__ __device__ static void update(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& desired_score_when_missed, const bool new_insert) {
    auto* const score_slot = bucket->scores(key_pos);
    score_slot->store(desired_score_when_missed,
                      cuda::std::memory_order_relaxed);
  }

  /**
   * Writes the digest first and then the score of slot `key_pos`, both with
   * `__stcg`. The slot addresses are resolved from the key array pointer via
   * `BUCKET::digests` / `BUCKET::scores`. `input_scores`, `key_idx` and
   * `new_insert` are not used.
   */
  __forceinline__ __device__ static void update_with_digest(
      K* __restrict bucket_key_ptr, const uint32_t& key_pos,
      const S* __restrict const input_scores, const uint32_t& key_idx,
      const S& desired_score_when_missed, const uint32_t& bucket_capacity,
      const D& digest, const bool new_insert) {
    S* const score_slot =
        BUCKET::scores(bucket_key_ptr, bucket_capacity, key_pos);
    D* const digest_slot =
        BUCKET::digests(bucket_key_ptr, bucket_capacity, key_pos);
    // Cache in L2 cache, bypass L1 Cache.
    __stcg(digest_slot, digest);
    __stcg(score_slot, desired_score_when_missed);
  }

  /**
   * Overwrites the score of an existing key with the caller's score using a
   * relaxed atomic store. Does nothing when `input_scores` is nullptr;
   * `epoch` is not used.
   */
  __forceinline__ __device__ static void update_without_missed(
      BUCKET* __restrict bucket, const int key_pos,
      const S* __restrict const input_scores, const int key_idx,
      const S& epoch) {
    if (input_scores != nullptr) {
      auto* const score_slot = bucket->scores(key_pos);
      score_slot->store(input_scores[key_idx],
                        cuda::std::memory_order_relaxed);
    }
  }

  /**
   * Raw-pointer variant: overwrites the score of an existing key with the
   * caller's score via `__stcg`. Does nothing when `input_scores` is
   * nullptr; `epoch` is not used.
   */
  __forceinline__ __device__ static void update_without_missed(
      K* bucket_keys_ptr, const uint32_t bucket_capacity,
      const uint32_t key_pos, const S* __restrict const input_scores,
      const int key_idx, const S& epoch) {
    if (input_scores != nullptr) {
      S* const score_slot =
          BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
      // Cache in L2 cache, bypass L1 Cache.
      __stcg(score_slot, input_scores[key_idx]);
    }
  }
};

/**
 * Tile-cooperative element-wise copy of `dim` elements from `src` to `dst`.
 *
 * Each thread of the tile starts at its own rank and advances by the tile
 * size, so together the tile covers indices [0, dim).
 *
 * @param g    Tile whose threads share the copy.
 * @param src  Source elements.
 * @param dst  Destination elements.
 * @param dim  Number of elements to copy.
 */
template <class V, uint32_t TILE_SIZE = 4>
__device__ __forceinline__ void copy_vector(
    cg::thread_block_tile<TILE_SIZE> const& g, const V* src, V* dst,
    const size_t dim) {
  const auto stride = g.size();
  auto elem = g.thread_rank();
  while (elem < dim) {
    dst[elem] = src[elem];
    elem += stride;
  }
}

/**
 * Maps a key to its bucket and to the slot where probing starts.
 *
 * The key is hashed with `Murmur3HashDevice`, reduced modulo the total slot
 * count (`buckets_num * bucket_max_size`), and split into a bucket index and
 * an in-bucket offset. The offset is rounded down to a multiple of 4.
 *
 * @param buckets          Base of the bucket array.
 * @param key              Key to locate.
 * @param bkt_idx          [out] Index of the selected bucket.
 * @param start_idx        [out] In-bucket start offset (multiple of 4).
 * @param buckets_num      Number of buckets.
 * @param bucket_max_size  Slots per bucket.
 * @return Pointer to the selected bucket.
 */
template <class K, class V, class S>
__forceinline__ __device__ Bucket<K, V, S>* get_key_position(
    Bucket<K, V, S>* __restrict buckets, const K key, size_t& bkt_idx,
    size_t& start_idx, const size_t buckets_num, const size_t bucket_max_size) {
  const K hash = Murmur3HashDevice(key);
  const size_t total_slots = buckets_num * bucket_max_size;
  const size_t slot = hash % total_slots;

  bkt_idx = slot / bucket_max_size;

  const size_t offset_in_bucket = slot % bucket_max_size;
  start_idx = offset_in_bucket - (offset_in_bucket % 4);

  return &buckets[bkt_idx];
}

/**
 * Computes the in-bucket probing start offset for a global slot index.
 *
 * The index is masked with `bucket_capacity - 1`, narrowed to 32 bits and
 * rounded down to a multiple of 4.
 * NOTE(review): the mask only acts as a modulo when `bucket_capacity` is a
 * power of two -- presumably guaranteed by the callers; confirm.
 *
 * @param global_idx       Global slot index.
 * @param bucket_capacity  Slots per bucket.
 * @return Start offset inside the bucket, aligned down to 4.
 */
__forceinline__ __device__ uint32_t get_start_position(
    const uint64_t& global_idx, const uint64_t& bucket_capacity) {
  const uint32_t offset_in_bucket =
      static_cast<uint32_t>(global_idx & (bucket_capacity - 1));
  // Clearing the two low bits is the same as subtracting `offset % 4`.
  return offset_in_bucket & ~static_cast<uint32_t>(3);
}

/**
 * Tile-cooperative, lock-free search for `desired_key` inside one bucket.
 *
 * The tile walks the bucket TILE_SIZE slots at a time starting at
 * `start_idx` (wrapping modulo `bucket_max_size`); each thread reads one key
 * with a relaxed load. The scan ends early as soon as any thread of the tile
 * observes an empty slot.
 *
 * @param g                Tile performing the search.
 * @param bucket           Bucket to scan.
 * @param desired_key      Key to look for.
 * @param start_idx        Slot where the scan starts.
 * @param key_pos          [out] On DUPLICATE, the matching slot (broadcast to
 *                         the whole tile); otherwise the last slot this
 *                         thread inspected.
 * @param src_lane         [out] On DUPLICATE, lowest lane that matched.
 * @param bucket_max_size  Slots per bucket.
 * @return OccupyResult::DUPLICATE when the key was found, otherwise
 *         OccupyResult::CONTINUE.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__device__ __forceinline__ OccupyResult find_without_lock(
    cg::thread_block_tile<TILE_SIZE> g, Bucket<K, V, S>* __restrict__ bucket,
    const K desired_key, const size_t start_idx, int& key_pos, int& src_lane,
    const size_t bucket_max_size) {
  const K empty_key = static_cast<K>(EMPTY_KEY);

  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;
       tile_offset += TILE_SIZE) {
    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;

    const K probed_key =
        bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed);

    // Did any lane hit the key we are looking for?
    const unsigned match_mask = g.ballot(desired_key == probed_key);
    if (match_mask != 0) {
      src_lane = __ffs(match_mask) - 1;
      key_pos = g.shfl(key_pos, src_lane);
      return OccupyResult::DUPLICATE;
    }

    // An empty slot in this window ends the scan.
    const unsigned empty_mask = g.ballot(probed_key == empty_key);
    if (empty_mask != 0) break;
  }
  return OccupyResult::CONTINUE;
}

/**
 * Tile-cooperative search-and-lock for inserting `desired_key` into a bucket
 * that may still contain empty slots.
 *
 * A slot is "locked" by atomically replacing its key with LOCKED_KEY. The
 * function tries, in order:
 *   1. to lock the slot already holding `desired_key`   -> DUPLICATE
 *   2. to lock an empty slot seen in the same window     -> OCCUPIED_EMPTY
 *   3. to lock the slot with the smallest recorded score -> EVICT, or
 *      OCCUPIED_RECLAIMED when the replaced key is RECLAIM_KEY.
 * REFUSED is returned when `desired_score` is lower than the smallest
 * recorded score; CONTINUE is returned when a race was lost or nothing could
 * be locked.
 *
 * All atomics in this function use relaxed ordering, except the store that
 * restores a key after a failed eviction, which uses release.
 *
 * @param g                Tile performing the operation.
 * @param bucket           Bucket to operate on.
 * @param desired_key      Key to insert or find.
 * @param desired_score    Score the key would be inserted with.
 * @param evicted_key      [out] Written only by the lane that attempts the
 *                         eviction.
 * @param start_idx        Slot where probing starts.
 * @param key_pos          [out] Locked slot on DUPLICATE / OCCUPIED_EMPTY /
 *                         EVICT / OCCUPIED_RECLAIMED (broadcast to the tile).
 * @param src_lane         [out] Lane that obtained (or tried to obtain) the
 *                         slot.
 * @param bucket_max_size  Slots per bucket.
 *
 * NOTE(review): `local_min_score_pos` stays -1 on a lane that never records a
 * candidate. If every lane keeps `local_min_score_val == MAX_SCORE` and
 * `desired_score` is not lower than that, the final ballot can select such a
 * lane, which then indexes `bucket->keys(-1)`. Confirm whether callers can
 * reach this state.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__device__ __inline__ OccupyResult find_and_lock_when_vacant(
    cg::thread_block_tile<TILE_SIZE> g, Bucket<K, V, S>* __restrict__ bucket,
    const K desired_key, const S desired_score, K& evicted_key,
    const size_t start_idx, int& key_pos, int& src_lane,
    const size_t bucket_max_size) {
  K expected_key = static_cast<K>(EMPTY_KEY);

  AtomicKey<K>* current_key;
  AtomicScore<S>* current_score;

  // Per-lane eviction candidate: key, score and slot of the smallest score
  // this lane has seen.
  K local_min_score_key = static_cast<K>(EMPTY_KEY);

  S local_min_score_val = static_cast<S>(MAX_SCORE);
  S temp_min_score_val = static_cast<S>(MAX_SCORE);
  int local_min_score_pos = -1;

  unsigned vote = 0;
  bool result = false;

  // Pass 1: walk the bucket one tile-wide window at a time, each lane owning
  // one slot of the window.
  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;
       tile_offset += TILE_SIZE) {
    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;

    current_key = bucket->keys(key_pos);

    // Step 1: try find and lock the desired_key.
    do {
      // On CAS failure `expected_key` receives the key currently stored in
      // the slot, which the ballots below inspect.
      expected_key = desired_key;
      result = current_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);
      vote = g.ballot(result);
      if (vote) {
        src_lane = __ffs(vote) - 1;
        key_pos = g.shfl(key_pos, src_lane);
        return OccupyResult::DUPLICATE;
      }
      // Some slot in the window is locked by someone else: `continue` jumps
      // to the loop condition with a non-zero `vote`, so the window is
      // retried.
      vote = g.ballot(expected_key == static_cast<K>(LOCKED_KEY));
      if (vote) continue;
      // No locked slot: leave the retry loop, carrying the mask of lanes
      // that saw an empty slot (possibly zero) into step 2.
      vote = g.ballot(expected_key == static_cast<K>(EMPTY_KEY));
      if (vote) break;
    } while (vote != 0);

    // Step 2: (TBD)try find empty location.
    // Visit the lanes that saw an empty slot from lowest to highest; each in
    // turn tries to lock its slot.
    while (vote) {
      src_lane = __ffs(vote) - 1;
      if (src_lane == g.thread_rank()) {
        expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);
      }
      result = g.shfl(result, src_lane);
      if (result) {
        key_pos = g.shfl(key_pos, src_lane);
        return OccupyResult::OCCUPIED_EMPTY;
      }
      // The slot was taken in the meantime. If it now holds `desired_key` or
      // is locked, give up this attempt and report CONTINUE.
      result = g.shfl((expected_key == desired_key ||
                       expected_key == static_cast<K>(LOCKED_KEY)),
                      src_lane);
      if (result) {
        return OccupyResult::CONTINUE;
      }
      // Otherwise drop this lane from the mask and try the next one.
      vote -= ((unsigned(0x1)) << src_lane);
    }
  }

  // Pass 2: scan the whole bucket again to find an eviction candidate.
  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;
       tile_offset += TILE_SIZE) {
    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;

    current_score = bucket->scores(key_pos);

    // Step 4: record min score location.
    temp_min_score_val = current_score->load(cuda::std::memory_order_relaxed);
    if (temp_min_score_val < local_min_score_val) {
      expected_key =
          bucket->keys(key_pos)->load(cuda::std::memory_order_relaxed);
      // Locked and empty slots are not eviction candidates.
      if (expected_key != static_cast<K>(LOCKED_KEY) &&
          expected_key != static_cast<K>(EMPTY_KEY)) {
        local_min_score_key = expected_key;
        local_min_score_val = temp_min_score_val;
        local_min_score_pos = key_pos;
      }
    }
  }
  // Step 5: insert by evicting some one.
  // Reduce the per-lane minima to the tile-wide minimum.
  const S global_min_score_val =
      cg::reduce(g, local_min_score_val, cg::less<S>());
  if (desired_score < global_min_score_val) {
    return OccupyResult::REFUSED;
  }
  // Lanes whose candidate carries the tile-wide minimum; the lowest such
  // lane performs the eviction attempt.
  vote = g.ballot(local_min_score_val <= global_min_score_val);
  if (vote) {
    src_lane = __ffs(vote) - 1;
    result = false;
    if (src_lane == g.thread_rank()) {
      // TBD: Here can be compare_exchange_weak. Do benchmark.
      current_key = bucket->keys(local_min_score_pos);
      current_score = bucket->scores(local_min_score_pos);
      evicted_key = local_min_score_key;
      result = current_key->compare_exchange_strong(
          local_min_score_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);

      // Need to recover when fail.
      // The slot was locked, but its score is now above the minimum that was
      // recorded: restore the key and report failure.
      if (result && (current_score->load(cuda::std::memory_order_relaxed) >
                     global_min_score_val)) {
        current_key->store(local_min_score_key,
                           cuda::std::memory_order_release);
        result = false;
      }
    }
    result = g.shfl(result, src_lane);
    if (result) {
      // Only the `src_lane` thread holds a valid `evicted_key`; the other
      // lanes evaluate the return expression below with whatever value their
      // `evicted_key` argument already had.
      key_pos = g.shfl(local_min_score_pos, src_lane);
      return (evicted_key == static_cast<K>(RECLAIM_KEY))
                 ? OccupyResult::OCCUPIED_RECLAIMED
                 : OccupyResult::EVICT;
    }
  }
  return OccupyResult::CONTINUE;
}

/**
 * Tile-cooperative search-and-lock for inserting `desired_key` when no empty
 * slot is looked for: the key is either found and locked, or the entry with
 * the smallest score is locked for eviction.
 *
 * A slot is "locked" by atomically replacing its key with LOCKED_KEY.
 *
 * @tparam LOCK_MEM_ORDER    Success ordering of the locking CAS, also used
 *                           for the key loads that wait on a locked slot.
 * @tparam UNLOCK_MEM_ORDER  Ordering of the store that restores a key after
 *                           a failed eviction.
 *
 * @param g                Tile performing the operation.
 * @param bucket           Bucket to operate on.
 * @param desired_key      Key to insert or find.
 * @param desired_score    Score the key would be inserted with.
 * @param evicted_key      [out] Written only by the lane that attempts the
 *                         eviction.
 * @param start_idx        Slot where probing starts.
 * @param key_pos          [out] Locked slot on DUPLICATE / EVICT /
 *                         OCCUPIED_RECLAIMED (broadcast to the tile).
 * @param src_lane         [out] Lane that obtained (or tried to obtain) the
 *                         slot.
 * @param bucket_max_size  Slots per bucket.
 * @return DUPLICATE when `desired_key` was found and locked; REFUSED when
 *         `desired_score` is lower than the smallest recorded score; EVICT or
 *         OCCUPIED_RECLAIMED (replaced key equals RECLAIM_KEY) when a victim
 *         was locked; CONTINUE otherwise.
 *
 * NOTE(review): `local_min_score_pos` stays -1 on a lane that never records a
 * candidate (for example when every score it reads equals MAX_SCORE). If
 * `desired_score` is not lower than MAX_SCORE in that situation, the final
 * ballot can select such a lane, which then indexes `bucket->keys(-1)`.
 * Confirm whether callers can reach this state.
 */
template <class K, class V, class S, uint32_t TILE_SIZE,
          cuda::std::memory_order LOCK_MEM_ORDER,
          cuda::std::memory_order UNLOCK_MEM_ORDER>
__device__ __forceinline__ OccupyResult find_and_lock_when_full(
    cg::thread_block_tile<TILE_SIZE> g, Bucket<K, V, S>* __restrict__ bucket,
    const K desired_key, const S desired_score, K& evicted_key,
    const size_t start_idx, int& key_pos, int& src_lane,
    const size_t bucket_max_size) {
  K expected_key = static_cast<K>(EMPTY_KEY);

  AtomicKey<K>* current_key;
  AtomicScore<S>* current_score;

  // Per-lane eviction candidate: key, score and slot of the smallest score
  // this lane has seen.
  K local_min_score_key = static_cast<K>(EMPTY_KEY);

  S local_min_score_val = static_cast<S>(MAX_SCORE);
  S temp_min_score_val = static_cast<S>(MAX_SCORE);
  int local_min_score_pos = -1;

  unsigned vote = 0;
  bool result = false;

  // Pass 1: walk the bucket one tile-wide window at a time, each lane owning
  // one slot of the window.
  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;
       tile_offset += TILE_SIZE) {
    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;

    current_key = bucket->keys(key_pos);

    // Step 1: try find and lock the desired_key.
    do {
      // On CAS failure `expected_key` receives the key currently stored in
      // the slot.
      expected_key = desired_key;
      result = current_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY), LOCK_MEM_ORDER,
          cuda::std::memory_order_relaxed);
      vote = g.ballot(result);
      if (vote) {
        src_lane = __ffs(vote) - 1;
        key_pos = g.shfl(key_pos, src_lane);
        return OccupyResult::DUPLICATE;
      }
      // Retry this window for as long as any lane sees a locked slot.
      vote = g.ballot(expected_key == static_cast<K>(LOCKED_KEY));
    } while (vote != 0);
  }

  // Pass 2: scan the whole bucket again to find an eviction candidate.
  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;
       tile_offset += TILE_SIZE) {
    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;

    // Step 2: record min score location.
    temp_min_score_val =
        bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed);
    if (temp_min_score_val < local_min_score_val) {
      // Spin until the slot is no longer locked, then record whatever key it
      // holds.
      while ((expected_key = bucket->keys(key_pos)->load(LOCK_MEM_ORDER)) ==
             static_cast<K>(LOCKED_KEY)) {
      };
      local_min_score_key = expected_key;
      local_min_score_val = temp_min_score_val;
      local_min_score_pos = key_pos;
    }
  }

  // Step 3: insert by evicting some one.
  // Reduce the per-lane minima to the tile-wide minimum.
  const S global_min_score_val =
      cg::reduce(g, local_min_score_val, cg::less<S>());
  if (desired_score < global_min_score_val) {
    return OccupyResult::REFUSED;
  }
  // Lanes whose candidate carries the tile-wide minimum; the lowest such
  // lane performs the eviction attempt.
  vote = g.ballot(local_min_score_val <= global_min_score_val);
  if (vote) {
    src_lane = __ffs(vote) - 1;
    result = false;
    if (src_lane == g.thread_rank()) {
      // TBD: Here can be compare_exchange_weak. Do benchmark.
      current_key = bucket->keys(local_min_score_pos);
      current_score = bucket->scores(local_min_score_pos);
      evicted_key = local_min_score_key;
      result = current_key->compare_exchange_strong(
          local_min_score_key, static_cast<K>(LOCKED_KEY), LOCK_MEM_ORDER,
          cuda::std::memory_order_relaxed);

      // Need to recover when fail.
      // The slot was locked, but its score is now above the minimum that was
      // recorded: restore the key and report failure.
      if (result && (current_score->load(cuda::std::memory_order_relaxed) >
                     global_min_score_val)) {
        current_key->store(local_min_score_key, UNLOCK_MEM_ORDER);
        result = false;
      }
    }
    result = g.shfl(result, src_lane);
    if (result) {
      // Only the `src_lane` thread holds a valid `evicted_key`; the other
      // lanes evaluate the return expression below with whatever value their
      // `evicted_key` argument already had.
      key_pos = g.shfl(local_min_score_pos, src_lane);
      return (evicted_key == static_cast<K>(RECLAIM_KEY))
                 ? OccupyResult::OCCUPIED_RECLAIMED
                 : OccupyResult::EVICT;
    }
  }
  return OccupyResult::CONTINUE;
}

/**
 * Tile-cooperative search for an existing `desired_key` that locks the slot
 * on success (the key is replaced by LOCKED_KEY with a relaxed CAS).
 *
 * The bucket is walked TILE_SIZE slots at a time from `start_idx`. A window
 * is retried while any lane observes a locked slot; the search is abandoned
 * as soon as any lane observes an empty slot.
 *
 * @param g                Tile performing the operation.
 * @param bucket           Bucket to search.
 * @param desired_key      Key to find and lock.
 * @param start_idx        Slot where probing starts.
 * @param key_pos          [out] Locked slot on DUPLICATE (broadcast to the
 *                         tile).
 * @param src_lane         [out] Lowest lane that locked the key.
 * @param bucket_max_size  Slots per bucket.
 * @return OccupyResult::DUPLICATE when the key was found and locked,
 *         OccupyResult::REFUSED otherwise.
 */
template <class K, class V, class S, uint32_t TILE_SIZE>
__device__ __forceinline__ OccupyResult find_and_lock_for_update(
    cg::thread_block_tile<TILE_SIZE> g, Bucket<K, V, S>* __restrict__ bucket,
    const K desired_key, const size_t start_idx, int& key_pos, int& src_lane,
    const size_t bucket_max_size) {
  const K locked_key = static_cast<K>(LOCKED_KEY);
  const K empty_key = static_cast<K>(EMPTY_KEY);

  for (uint32_t tile_offset = 0; tile_offset < bucket_max_size;
       tile_offset += TILE_SIZE) {
    key_pos = (start_idx + tile_offset + g.thread_rank()) % bucket_max_size;
    AtomicKey<K>* const slot = bucket->keys(key_pos);

    // Step 1: try find and lock the desired_key.
    while (true) {
      // After a failed CAS `observed` holds the key currently in the slot.
      K observed = desired_key;
      const bool locked_it = slot->compare_exchange_strong(
          observed, locked_key, cuda::std::memory_order_relaxed,
          cuda::std::memory_order_relaxed);

      const unsigned hit_mask = g.ballot(locked_it);
      if (hit_mask != 0) {
        src_lane = __ffs(hit_mask) - 1;
        key_pos = g.shfl(key_pos, src_lane);
        return OccupyResult::DUPLICATE;
      }

      // An empty slot in the window ends the search.
      const unsigned empty_mask = g.ballot(observed == empty_key);
      if (empty_mask != 0) return OccupyResult::REFUSED;

      // Retry only while some slot of the window is locked.
      const unsigned locked_mask = g.ballot(observed == locked_key);
      if (locked_mask == 0) break;
    }
  }
  return OccupyResult::REFUSED;
}

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/core_kernels/lookup.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

/**
 * Lookup result sink that records hits in a boolean array.
 *
 * Calling the functor with `found == true` sets `founds[idx]` to true; a
 * miss leaves the array untouched. The key argument is ignored.
 */
template <typename K>
struct FoundFunctorV1 {
  // Per-key hit flags, indexed by the key's position in the request.
  bool* __restrict founds;

  __host__ __device__ FoundFunctorV1(bool* __restrict found_flags)
      : founds(found_flags) {}

  __forceinline__ __device__ void operator()(const int idx, const K /*key*/,
                                             const bool found) {
    if (!found) return;
    founds[idx] = true;
  }
};

/**
 * Lookup result sink that collects the keys which were NOT found.
 *
 * For every miss a slot is reserved by atomically incrementing
 * `*missed_size`; the key and its request index are written to that slot of
 * `missed_keys` / `missed_indices`. Hits are ignored.
 */
template <typename K>
struct FoundFunctorV2 {
  // Output: missed keys, in the order slots were reserved.
  K* __restrict missed_keys;
  // Output: request index of each missed key.
  int* __restrict missed_indices;
  // In/out: running count of missed keys.
  int* __restrict missed_size;

  __host__ __device__ FoundFunctorV2(K* __restrict out_keys,
                                     int* __restrict out_indices,
                                     int* __restrict out_count)
      : missed_keys(out_keys),
        missed_indices(out_indices),
        missed_size(out_count) {}

  __forceinline__ __device__ void operator()(const int idx, const K key,
                                             const bool found) {
    if (found) return;

    const int slot = atomicAdd(missed_size, 1);
    missed_keys[slot] = key;
    missed_indices[slot] = idx;
  }
};

/**
 * Argument bundle for a lookup that reports hits through a boolean array
 * (wrapped in a FoundFunctorV1).
 */
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct LookupKernelParams {
  Bucket<K, V, S>* __restrict buckets;  // bucket array
  size_t buckets_num;                   // number of buckets
  uint32_t dim;                         // value dimension
  const K* __restrict keys;             // keys to look up
  V* __restrict values;                 // output values
  S* __restrict scores;                 // output scores
  FoundFunctorV1<K> found_functor;      // built from the `founds` array
  size_t n;                             // number of keys

  LookupKernelParams(Bucket<K, V, S>* __restrict bucket_array,
                     size_t bucket_count, uint32_t value_dim,
                     const K* __restrict lookup_keys,
                     V* __restrict out_values, S* __restrict out_scores,
                     bool* __restrict out_founds, size_t key_count)
      : buckets(bucket_array),
        buckets_num(bucket_count),
        dim(value_dim),
        keys(lookup_keys),
        values(out_values),
        scores(out_scores),
        found_functor(out_founds),
        n(key_count) {}
};

/**
 * Argument bundle for a lookup that reports misses through the
 * missed-keys / missed-indices / missed-size triple (wrapped in a
 * FoundFunctorV2).
 */
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct LookupKernelParamsV2 {
  Bucket<K, V, S>* __restrict buckets;  // bucket array
  size_t buckets_num;                   // number of buckets
  uint32_t dim;                         // value dimension
  const K* __restrict keys;             // keys to look up
  V* __restrict values;                 // output values
  S* __restrict scores;                 // output scores
  FoundFunctorV2<K> found_functor;      // built from the missed-* outputs
  size_t n;                             // number of keys

  LookupKernelParamsV2(Bucket<K, V, S>* __restrict bucket_array,
                       size_t bucket_count, uint32_t value_dim,
                       const K* __restrict lookup_keys,
                       V* __restrict out_values, S* __restrict out_scores,
                       K* __restrict out_missed_keys,
                       int* __restrict out_missed_indices,
                       int* __restrict out_missed_size, size_t key_count)
      : buckets(bucket_array),
        buckets_num(bucket_count),
        dim(value_dim),
        keys(lookup_keys),
        values(out_values),
        scores(out_scores),
        found_functor(out_missed_keys, out_missed_indices, out_missed_size),
        n(key_count) {}
};

/**
 * Software-pipelined lookup kernel: using 32 threads to deal with one key.
 *
 * Each 32-thread tile owns up to GROUP_SIZE consecutive keys and processes
 * them one per loop iteration through four overlapping stages:
 *   Step1 - prefetch the digests of the bucket of key i+1,
 *   Step2 - compare the digests of key i and fetch the candidate keys,
 *   Step3 - compare the candidate keys of key i-1, prefetch value and score,
 *   Step4 - write value and score of key i-2 to the outputs.
 * After the loop the remaining stages are drained, and finally
 * `found_functor` is invoked once per key.
 *
 * Layout assumptions visible in the code:
 *   - shared arrays are sized for BLOCK_SIZE (128) threads per block
 *     (NOTE(review): presumably launched with blockDim.x == 128 -- confirm);
 *   - buckets hold BUCKET_SIZE (128) keys and the digests are read from the
 *     BUCKET_SIZE bytes immediately preceding the bucket's key array;
 *   - `dim` and `values` are expressed in units of VecV;
 *   - `sm_vector` holds VALUE_BUF VecV elements per tile and buffer
 *     (NOTE(review): presumably `dim <= VALUE_BUF` -- confirm with the
 *     CopyValue policy).
 * `same_buf` / `diff_buf` are helpers defined outside this block that select
 * one of the two halves of each double buffer.
 *
 * @param buckets        Bucket array.
 * @param buckets_num    Number of buckets.
 * @param dim            Value length in VecV elements.
 * @param keys           Keys to look up (n entries).
 * @param values         Output values, `dim` VecV elements per key.
 * @param scores         Output scores, written through CopyScore.
 * @param found_functor  Called with (key index, key, found) for every key.
 * @param n              Number of keys.
 */
template <typename K = uint64_t, typename V = float, typename S = uint64_t,
          typename VecV = float4,
          typename CopyScore = CopyScoreEmpty<S, K, 128>,
          typename CopyValue = CopyValueTwoGroup<VecV, 32>,
          typename FoundFunctor = FoundFunctorV1<K>, int VALUE_BUF = 56>
__global__ void lookup_kernel_with_io_pipeline_v1(
    Bucket<K, V, S>* buckets, const size_t buckets_num, const int dim,
    const K* __restrict keys, VecV* __restrict values, S* __restrict scores,
    FoundFunctor found_functor, size_t n) {
  constexpr int GROUP_SIZE = 32;
  constexpr int RESERVE = 16;
  constexpr int BLOCK_SIZE = 128;
  constexpr int BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;

  __shared__ int sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr[BLOCK_SIZE];
  __shared__ VecV* sm_values_ptr[BLOCK_SIZE];
  __shared__ S sm_target_scores[BLOCK_SIZE];
  // Reuse: the candidate counters and the found flags alias the digest
  // array; each entry is consumed as a digest before it is overwritten.
  int* sm_counts = sm_target_digests;
  int* sm_founds = sm_counts;
  // Double buffer
  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];
  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];
  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];
  __shared__ VecV sm_vector[2][GROUP_NUM][VALUE_BUF];

  // Initialization
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;
  // Tiles that start past the end of the input have nothing to do.
  if (key_idx_base >= n) return;
  // Number of keys this tile handles (the last tile may be partial).
  int loop_num =
      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    K target_key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = target_key;
    const K hashed_key = Murmur3HashDevice(target_key);
    const uint8_t target_digest = static_cast<uint8_t>(hashed_key >> 32);
    sm_target_digests[idx_block] = static_cast<uint32_t>(target_digest);
    // Keep the slot and bucket indices in 64 bits: the total slot count
    // `buckets_num * BUCKET_SIZE` can exceed the range of `int`.
    const size_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);
    const size_t bkt_idx = global_idx / BUCKET_SIZE;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;
    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),
                            sizeof(K*));
    __pipeline_commit();
    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),
                            sizeof(VecV*));
  }
  __pipeline_wait_prior(0);

  // Pipeline loading: fetch the digests of the first key's bucket. Each
  // thread copies 4 digest bytes.
  uint8_t* digests_ptr =
      reinterpret_cast<uint8_t*>(sm_keys_ptr[groupID * GROUP_SIZE]) -
      BUCKET_SIZE;
  __pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
                          digests_ptr + rank * 4, sizeof(uint32_t));
  __pipeline_commit();
  // Padding, meet the param of the first `__pipeline_wait_prior`
  // in the first loop.
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    /* Step1: prefetch all digests in one bucket */
    if ((i + 1) < loop_num) {
      uint8_t* digests_ptr =
          reinterpret_cast<uint8_t*>(sm_keys_ptr[key_idx_block + 1]) -
          BUCKET_SIZE;
      __pipeline_memcpy_async(
          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,
          digests_ptr + rank * 4, sizeof(uint32_t));
    }
    __pipeline_commit();

    /* Step2: check digests and load possible keys */
    uint32_t target_digest = sm_target_digests[key_idx_block];
    // Replicate the digest byte into all four byte lanes.
    uint32_t target_digests = __byte_perm(target_digest, target_digest, 0x0000);
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(3);
    uint32_t probing_digests =
        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);
    // Compress the per-byte comparison result into one bit per digest.
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    int find_number = __popc(find_result);
    int group_base = 0;
    if (find_number > 0) {
      // Reserve `find_number` consecutive candidate slots for this thread.
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr[key_idx_block];
    if (gt_vote == 0) {
      // All candidates fit into the RESERVE slots: fetch them asynchronously
      // and compare them in Step3 of the next iteration.
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =
              key_pos;
          __pipeline_memcpy_async(
              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      // Too many digest matches: compare the candidate keys right away and
      // keep at most the single matching one in slot 0.
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;
            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) {
          break;
        }
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    /* Step3: check possible keys, and prefetch the value and score */
    if (i > 0) {
      key_idx_block -= 1;
      K target_key = sm_target_keys[key_idx_block];
      int possible_num = sm_counts[key_idx_block];
      sm_founds[key_idx_block] = 0;
      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr, key_idx_block);
      VecV* value_ptr = sm_values_ptr[key_idx_block];
      __pipeline_wait_prior(3);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];
        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
          CopyScore::ldg_sts(sm_target_scores + key_idx_block,
                             score_ptr + key_pos);
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        VecV* v_dst = sm_vector[diff_buf(i)][groupID];
        sm_founds[key_idx_block] = 1;
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        VecV* v_src = value_ptr + target_pos * dim;
        CopyValue::ldg_sts(rank, v_dst, v_src, dim);
      }
    }
    __pipeline_commit();

    /* Step4: write back value and score */
    if (i > 1) {
      key_idx_block -= 1;
      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
      VecV* v_src = sm_vector[same_buf(i)][groupID];
      // 64-bit offset: `key_idx_grid * dim` can exceed the range of `int`.
      VecV* v_dst = values + static_cast<size_t>(key_idx_grid) * dim;
      int found_flag = sm_founds[key_idx_block];
      __pipeline_wait_prior(3);
      if (found_flag > 0) {
        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        CopyScore::stg(scores + key_idx_grid, score_);
      }
    }
  }  // End loop

  /* Pipeline emptying: step3, i = loop_num */
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    K target_key = sm_target_keys[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    sm_founds[key_idx_block] = 0;
    S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr, key_idx_block);
    VecV* value_ptr = sm_values_ptr[key_idx_block];
    __pipeline_wait_prior(1);
    int key_pos;
    bool found_flag = false;
    if (rank < possible_num) {
      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];
      K possible_key =
          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];
      if (target_key == possible_key) {
        found_flag = true;
        CopyScore::ldg_sts(sm_target_scores + key_idx_block,
                           score_ptr + key_pos);
      }
    }
    int found_vote = g.ballot(found_flag);
    if (found_vote) {
      sm_founds[key_idx_block] = 1;
      int src_lane = __ffs(found_vote) - 1;
      int target_pos = g.shfl(key_pos, src_lane);
      VecV* v_src = value_ptr + target_pos * dim;
      VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];
      CopyValue::ldg_sts(rank, v_dst, v_src, dim);
    }
  }
  __pipeline_commit();

  /* Pipeline emptying: step4, i = loop_num */
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    VecV* v_src = sm_vector[same_buf(loop_num)][groupID];
    // 64-bit offset: `key_idx_grid * dim` can exceed the range of `int`.
    VecV* v_dst = values + static_cast<size_t>(key_idx_grid) * dim;
    int found_flag = sm_founds[key_idx_block];
    __pipeline_wait_prior(1);
    if (found_flag > 0) {
      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      CopyScore::stg(scores + key_idx_grid, score_);
    }
  }

  /* Pipeline emptying: step4, i = loop_num + 1 */
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];
    // 64-bit offset: `key_idx_grid * dim` can exceed the range of `int`.
    VecV* v_dst = values + static_cast<size_t>(key_idx_grid) * dim;
    int found_flag = sm_founds[key_idx_block];
    __pipeline_wait_prior(0);
    if (found_flag > 0) {
      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      CopyScore::stg(scores + key_idx_grid, score_);
    }
  }

  // Report the outcome of every key handled by this tile, one key per lane.
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    found_functor(key_idx_grid, sm_target_keys[key_idx_block],
                  sm_founds[key_idx_block]);
  }
}  // End function

// Using 16 threads to deal with one key
//
// Software-pipelined lookup that copies the value (and, depending on
// `CopyScore`, the score) of every found key to the output arrays.
//
// Work layout:
//   * The block is split into tiles of GROUP_SIZE (16) threads. Each tile owns
//     up to GROUP_SIZE consecutive keys and processes one key per loop
//     iteration, so iteration `i` of a tile runs four overlapped stages:
//       Step1: prefetch the digests of key i + 1,
//       Step2: compare the digests of key i and fetch candidate keys,
//       Step3: compare the candidate keys of key i - 1 and prefetch its
//              value/score,
//       Step4: write the value/score of key i - 2 to the output.
//     The stages still in flight after the loop are drained afterwards.
//
// Preconditions visible from this code:
//   * Shared arrays are sized for BLOCK_SIZE (128) threads per block.
//   * Buckets hold BUCKET_SIZE (128) slots and the digests are read from the
//     BUCKET_SIZE bytes immediately preceding the keys array.
//   * `dim` is the value length expressed in `VecV` elements and must fit in
//     `VALUE_BUF` elements of shared memory per tile.
//
// `found_functor(key_index, key, found)` is invoked once per processed key.
template <typename K = uint64_t, typename V = float, typename S = uint64_t,
          typename VecV = float4,
          typename CopyScore = CopyScoreEmpty<S, K, 128>,
          typename CopyValue = CopyValueTwoGroup<VecV, 16>,
          typename FoundFunctor = FoundFunctorV1<K>, int VALUE_BUF = 32>
__global__ void lookup_kernel_with_io_pipeline_v2(
    Bucket<K, V, S>* buckets, const size_t buckets_num, const int dim,
    const K* __restrict keys, VecV* __restrict values, S* __restrict scores,
    FoundFunctor found_functor, size_t n) {
  constexpr int GROUP_SIZE = 16;
  // Maximum number of candidate keys (digest matches) buffered per key.
  constexpr int RESERVE = 8;
  constexpr int BLOCK_SIZE = 128;
  constexpr int BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  // Number of 32-bit words needed to hold one bucket's 8-bit digests.
  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;

  __shared__ int sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr[BLOCK_SIZE];
  __shared__ VecV* sm_values_ptr[BLOCK_SIZE];
  __shared__ S sm_target_scores[BLOCK_SIZE];
  // Reuse: the per-key digest slot is later overwritten with the candidate
  // count (Step2) and then with the found flag (Step3).
  int* sm_counts = sm_target_digests;
  int* sm_founds = sm_counts;
  // Double buffer
  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];
  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];
  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];
  __shared__ VecV sm_vector[2][GROUP_NUM][VALUE_BUF];

  // Initialization
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;
  // The whole tile leaves together when it has no key to process.
  if (key_idx_base >= n) return;
  // Number of keys handled by this tile (shorter for the last tile).
  int loop_num =
      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    K target_key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = target_key;
    const K hashed_key = Murmur3HashDevice(target_key);
    const uint8_t target_digest = static_cast<uint8_t>(hashed_key >> 32);
    sm_target_digests[idx_block] = static_cast<uint32_t>(target_digest);
    // Keep the slot and bucket indices 64-bit wide: the modulus can exceed
    // the range of `int` for large tables.
    const size_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);
    const size_t bkt_idx = global_idx / BUCKET_SIZE;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;
    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),
                            sizeof(K*));
    __pipeline_commit();
    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),
                            sizeof(VecV*));
  }
  __pipeline_wait_prior(0);

  // Pipeline loading
  // Each thread copies 8 digest bytes, so the tile fetches the whole bucket's
  // digests of the first key.
  uint8_t* digests_ptr =
      reinterpret_cast<uint8_t*>(sm_keys_ptr[groupID * GROUP_SIZE]) -
      BUCKET_SIZE;
  __pipeline_memcpy_async(
      sm_probing_digests[0] + groupID * DIGEST_SPAN + rank * 2,
      digests_ptr + rank * 8, sizeof(uint2));
  __pipeline_commit();
  // Padding, meet the param of the first `__pipeline_wait_prior`
  // in the first loop.
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    /* Step1: prefetch all digests in one bucket */
    if ((i + 1) < loop_num) {
      uint8_t* digests_ptr =
          reinterpret_cast<uint8_t*>(sm_keys_ptr[key_idx_block + 1]) -
          BUCKET_SIZE;
      __pipeline_memcpy_async(
          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank * 2,
          digests_ptr + rank * 8, sizeof(uint2));
    }
    __pipeline_commit();

    /* Step2: check digests and load possible keys */
    uint32_t target_digest = sm_target_digests[key_idx_block];
    // Replicate the 8-bit digest into all four bytes for `__vcmpeq4`.
    uint32_t target_digests = __byte_perm(target_digest, target_digest, 0x0000);
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(3);
    // Every thread checks two words (8 digests): word `rank` and word
    // `rank + GROUP_SIZE`. Bits 0-3 of `find_result` flag matches in the
    // first word, bits 4-7 matches in the second one.
    uint32_t probing_digests =
        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    probing_digests = sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN +
                                                      rank + GROUP_SIZE];
    find_result_ = __vcmpeq4(probing_digests, target_digests);
    if ((find_result_ & 0x01) != 0) find_result |= 0x10;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x20;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x40;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x80;
    int find_number = __popc(find_result);
    int group_base = 0;
    if (find_number > 0) {
      // Reserve a contiguous range of candidate slots for this thread.
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr[key_idx_block];
    if (gt_vote == 0) {
      // All candidates fit in the reserve buffer: fetch them asynchronously.
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          // Bits 0-3 map to word `rank`, bits 4-7 to word `rank + GROUP_SIZE`
          // (the `- 1` compensates for `digest_idx` starting at 4 there).
          int key_pos = digest_idx < 4
                            ? (rank * 4 + digest_idx)
                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);
          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =
              key_pos;
          __pipeline_memcpy_async(
              sm_possible_keys[same_buf(i)] + groupID * RESERVE + group_base,
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      // Too many candidates for the reserve buffer: compare the keys directly
      // and record at most one candidate (the matching key).
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = digest_idx < 4
                            ? (rank * 4 + digest_idx)
                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;
            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) {
          break;
        }
        // Keep looping while any thread of the tile still had a candidate.
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    /* Step3: check possible keys, and prefetch the value and score */
    if (i > 0) {
      key_idx_block -= 1;
      K target_key = sm_target_keys[key_idx_block];
      int possible_num = sm_counts[key_idx_block];
      sm_founds[key_idx_block] = 0;
      S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr, key_idx_block);
      VecV* value_ptr = sm_values_ptr[key_idx_block];
      __pipeline_wait_prior(3);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];
        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
          CopyScore::ldg_sts(sm_target_scores + key_idx_block,
                             score_ptr + key_pos);
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        sm_founds[key_idx_block] = 1;
        // Broadcast the matching position from the lane that found the key.
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        VecV* v_src = value_ptr + target_pos * dim;
        VecV* v_dst = sm_vector[diff_buf(i)][groupID];
        CopyValue::ldg_sts(rank, v_dst, v_src, dim);
      }
    }
    __pipeline_commit();

    /* Step4: write back value and score */
    if (i > 1) {
      key_idx_block -= 1;
      int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
      int found_flag = sm_founds[key_idx_block];
      VecV* v_src = sm_vector[same_buf(i)][groupID];
      // Widen before multiplying so the output offset cannot overflow `int`.
      VecV* v_dst = values + static_cast<size_t>(key_idx_grid) * dim;
      __pipeline_wait_prior(3);
      if (found_flag > 0) {
        S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        CopyScore::stg(scores + key_idx_grid, score_);
      }
    }
  }  // End loop

  /* Pipeline emptying: step3, i = loop_num */
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    K target_key = sm_target_keys[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    sm_founds[key_idx_block] = 0;
    S* score_ptr = CopyScore::get_base_ptr(sm_keys_ptr, key_idx_block);
    VecV* value_ptr = sm_values_ptr[key_idx_block];
    __pipeline_wait_prior(1);
    int key_pos;
    bool found_flag = false;
    if (rank < possible_num) {
      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];
      K possible_key =
          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];
      if (possible_key == target_key) {
        found_flag = true;
        CopyScore::ldg_sts(sm_target_scores + key_idx_block,
                           score_ptr + key_pos);
      }
    }
    int found_vote = g.ballot(found_flag);
    if (found_vote) {
      sm_founds[key_idx_block] = 1;
      int src_lane = __ffs(found_vote) - 1;
      int target_pos = g.shfl(key_pos, src_lane);
      VecV* v_src = value_ptr + target_pos * dim;
      VecV* v_dst = sm_vector[diff_buf(loop_num)][groupID];
      CopyValue::ldg_sts(rank, v_dst, v_src, dim);
    }
  }
  __pipeline_commit();

  /* Pipeline emptying: step4, i = loop_num */
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    VecV* v_dst = values + static_cast<size_t>(key_idx_grid) * dim;
    VecV* v_src = sm_vector[same_buf(loop_num)][groupID];
    int found_flag = sm_founds[key_idx_block];
    __pipeline_wait_prior(1);
    if (found_flag > 0) {
      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      CopyScore::stg(scores + key_idx_grid, score_);
    }
  }

  /* Pipeline emptying: step4, i = loop_num + 1 */
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    VecV* v_dst = values + static_cast<size_t>(key_idx_grid) * dim;
    VecV* v_src = sm_vector[same_buf(loop_num + 1)][groupID];
    int found_flag = sm_founds[key_idx_block];
    __pipeline_wait_prior(0);
    if (found_flag > 0) {
      S score_ = CopyScore::lgs(sm_target_scores + key_idx_block);
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      CopyScore::stg(scores + key_idx_grid, score_);
    }
  }

  // Report the per-key result: one thread per key of the tile.
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    found_functor(key_idx_grid, sm_target_keys[key_idx_block],
                  sm_founds[key_idx_block]);
  }
}  // End function

// Launcher for `lookup_kernel_with_io_pipeline_v1`.
//
// Converts `params.dim` from a count of `V` elements into a count of `VecV`
// elements (note: `params` is modified in place) and picks the value-copy
// strategy from how many `VecV` elements a 32-thread group has to move.
template <typename K, typename V, typename S, typename CopyScore, typename VecV,
          uint32_t ValueBufSize>
struct LaunchPipelineLookupV1 {
  template <template <typename, typename, typename> typename LookupKernelParams>
  static void launch_kernel(LookupKernelParams<K, V, S>& params,
                            cudaStream_t& stream) {
    // Using 32 threads to deal with one key
    constexpr int kGroupSize = 32;
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    if (params.dim > (kGroupSize * 2)) {
      // More than two elements per thread.
      launch_with<CopyValueMultipleGroup<VecV, kGroupSize>>(params, stream);
    } else if (params.dim > kGroupSize) {
      // Between one and two elements per thread.
      launch_with<CopyValueTwoGroup<VecV, kGroupSize>>(params, stream);
    } else {
      // At most one element per thread.
      launch_with<CopyValueOneGroup<VecV, kGroupSize>>(params, stream);
    }
  }

 private:
  // Launches the v1 pipeline kernel with the chosen `CopyValue` policy, using
  // 128-thread blocks and one thread slot per key.
  template <typename CopyValue, typename Params>
  static void launch_with(Params& params, cudaStream_t& stream) {
    constexpr int kBlockSize = 128;
    // Capacity of the shared-memory value buffer, in `VecV` elements.
    constexpr uint32_t kVecCapacity = ValueBufSize / sizeof(VecV);
    const auto grid_size = (params.n + kBlockSize - 1) / kBlockSize;
    lookup_kernel_with_io_pipeline_v1<K, V, S, VecV, CopyScore, CopyValue,
                                      decltype(params.found_functor),
                                      kVecCapacity>
        <<<grid_size, kBlockSize, 0, stream>>>(
            params.buckets, params.buckets_num, params.dim, params.keys,
            reinterpret_cast<VecV*>(params.values), params.scores,
            params.found_functor, params.n);
  }
};

// Launcher for `lookup_kernel_with_io_pipeline_v2`.
//
// Converts `params.dim` from a count of `V` elements into a count of `VecV`
// elements (note: `params` is modified in place) and picks the value-copy
// strategy from how many `VecV` elements a 16-thread group has to move.
template <typename K, typename V, typename S, typename CopyScore, typename VecV,
          uint32_t ValueBufSize>
struct LaunchPipelineLookupV2 {
  template <template <typename, typename, typename> typename LookupKernelParams>
  static void launch_kernel(LookupKernelParams<K, V, S>& params,
                            cudaStream_t& stream) {
    // Using 16 threads to deal with one key
    constexpr int kGroupSize = 16;
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    if (params.dim > (kGroupSize * 2)) {
      // More than two elements per thread.
      launch_with<CopyValueMultipleGroup<VecV, kGroupSize>>(params, stream);
    } else if (params.dim > kGroupSize) {
      // Between one and two elements per thread.
      launch_with<CopyValueTwoGroup<VecV, kGroupSize>>(params, stream);
    } else {
      // At most one element per thread.
      launch_with<CopyValueOneGroup<VecV, kGroupSize>>(params, stream);
    }
  }

 private:
  // Launches the v2 pipeline kernel with the chosen `CopyValue` policy, using
  // 128-thread blocks and one thread slot per key.
  template <typename CopyValue, typename Params>
  static void launch_with(Params& params, cudaStream_t& stream) {
    constexpr int kBlockSize = 128;
    // Capacity of the shared-memory value buffer, in `VecV` elements.
    constexpr uint32_t kVecCapacity = ValueBufSize / sizeof(VecV);
    const auto grid_size = (params.n + kBlockSize - 1) / kBlockSize;
    lookup_kernel_with_io_pipeline_v2<K, V, S, VecV, CopyScore, CopyValue,
                                      decltype(params.found_functor),
                                      kVecCapacity>
        <<<grid_size, kBlockSize, 0, stream>>>(
            params.buckets, params.buckets_num, params.dim, params.keys,
            reinterpret_cast<VecV*>(params.values), params.scores,
            params.found_functor, params.n);
  }
};

// Per-architecture size, in bytes, of the shared-memory value buffer used by
// each pipeline lookup kernel. Only the specializations below are defined.
template <typename ArchTag>
struct LookupValueBufConfig;

/// TODO: support more arch
// Sm80: 224 floats for the v1 pipeline, 128 floats for the v2 pipeline.
template <>
struct LookupValueBufConfig<Sm80> {
  static constexpr uint32_t size_pipeline_v1 = sizeof(float) * 224;
  static constexpr uint32_t size_pipeline_v2 = sizeof(float) * 128;
};

// Sm70: half of the Sm80 sizes (112 and 64 floats).
template <>
struct LookupValueBufConfig<Sm70> {
  static constexpr uint32_t size_pipeline_v1 = sizeof(float) * 112;
  static constexpr uint32_t size_pipeline_v2 = sizeof(float) * 64;
};

// Chooses and launches the pipeline lookup kernel (v1 or v2), the score-copy
// policy and the vector type used to move values.
//
// Kernel choice: a value that fits the v2 staging buffer is handled by the
// 16-threads-per-key kernel (v2); anything larger goes to the
// 32-threads-per-key kernel (v1), whose staging buffer is the larger one.
// Callers must not pass values larger than `max_value_size()` bytes: neither
// kernel's shared-memory buffer can hold them.
template <typename K, typename V, typename S = uint64_t,
          typename ArchTag = Sm80>
struct SelectPipelineLookupKernelWithIO {
  using ValueBufConfig = LookupValueBufConfig<ArchTag>;

  // Largest value size, in bytes, supported by the pipeline kernels.
  static inline uint32_t max_value_size() {
    return ValueBufConfig::size_pipeline_v1;
  }

  // Launches the selected kernel on `stream`. `params.dim` is rewritten by
  // the launcher into units of the selected vector type.
  template <template <typename, typename, typename> typename LookupKernelParams>
  static void select_kernel(LookupKernelParams<K, V, S>& params,
                            cudaStream_t& stream) {
    constexpr int BUCKET_SIZE = 128;

    if (params.scores == nullptr) {
      // No score output requested: use the no-op score policy.
      select_by_value_size<CopyScoreEmpty<S, K, BUCKET_SIZE>>(params, stream);
    } else {
      select_by_value_size<CopyScoreByPassCache<S, K, BUCKET_SIZE>>(params,
                                                                    stream);
    }
  }  // End function

 private:
  // Picks the kernel version whose staging buffer can hold one value.
  template <typename CopyScore, typename Params>
  static void select_by_value_size(Params& params, cudaStream_t& stream) {
    constexpr uint32_t buf_size_v1 = ValueBufConfig::size_pipeline_v1;
    constexpr uint32_t buf_size_v2 = ValueBufConfig::size_pipeline_v2;

    // Must be computed before the launcher rescales `params.dim`.
    const uint32_t total_value_size =
        static_cast<uint32_t>(params.dim * sizeof(V));

    if (total_value_size > buf_size_v2) {
      // Too large for the v2 buffer; the v1 buffer is the larger one.
      select_vector_type<LaunchPipelineLookupV1, CopyScore, buf_size_v1>(
          params, stream, total_value_size);
    } else {
      select_vector_type<LaunchPipelineLookupV2, CopyScore, buf_size_v2>(
          params, stream, total_value_size);
    }
  }

  // Picks the widest vector type whose size divides the value size and
  // forwards to the given launcher.
  template <template <typename, typename, typename, typename, typename,
                      uint32_t>
            class Launcher,
            typename CopyScore, uint32_t BufSize, typename Params>
  static void select_vector_type(Params& params, cudaStream_t& stream,
                                 const uint32_t total_value_size) {
    if (total_value_size % sizeof(float4) == 0) {
      Launcher<K, V, S, CopyScore, float4, BufSize>::launch_kernel(params,
                                                                   stream);
    } else if (total_value_size % sizeof(float2) == 0) {
      Launcher<K, V, S, CopyScore, float2, BufSize>::launch_kernel(params,
                                                                   stream);
    } else if (total_value_size % sizeof(float) == 0) {
      Launcher<K, V, S, CopyScore, float, BufSize>::launch_kernel(params,
                                                                  stream);
    } else if (total_value_size % sizeof(uint16_t) == 0) {
      Launcher<K, V, S, CopyScore, uint16_t, BufSize>::launch_kernel(params,
                                                                     stream);
    } else {
      Launcher<K, V, S, CopyScore, uint8_t, BufSize>::launch_kernel(params,
                                                                    stream);
    }
  }
};

/* Lookup that also performs the IO: for every key that is found, the value is
 * copied into `values` and, when `scores` is not null, the score is copied
 * into `scores`. This kernel is usually used for the pure HBM mode for better
 * performance.
 *
 * A tile of TILE_SIZE threads cooperates on one key, so `N` is the number of
 * keys multiplied by TILE_SIZE. Reserved keys are skipped without reporting.
 * `found_functor(key_index, key, found)` is called by lane 0 of each tile.
 */
template <class K, class V, class S, class FoundFunctor, uint32_t TILE_SIZE = 4>
__global__ void lookup_kernel_with_io(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, V* __restrict values, S* __restrict scores,
    FoundFunctor found_functor, size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  const int lane = g.thread_rank();
  int* occupancy = table->buckets_size;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    // All threads of a tile share the same key index.
    int key_idx = t / TILE_SIZE;

    const K find_key = keys[key_idx];
    if (IS_RESERVED_KEY<K>(find_key)) continue;

    V* out_value = values + key_idx * dim;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    // For a full bucket, start probing at a tile-aligned position.
    if (occupancy[bkt_idx] >= bucket_max_size) {
      start_idx -= start_idx % TILE_SIZE;
    }

    int key_pos = -1;
    int src_lane = -1;
    const OccupyResult status = find_without_lock<K, V, S, TILE_SIZE>(
        g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size);

    const bool found = (status == OccupyResult::DUPLICATE);
    if (found) {
      // The whole tile copies the value; only the matching lane copies the
      // score.
      copy_vector<V, TILE_SIZE>(g, bucket->vectors + key_pos * dim, out_value,
                                dim);
      if (lane == src_lane && scores != nullptr) {
        scores[key_idx] =
            bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed);
      }
    }

    if (lane == 0) {
      found_functor(key_idx, find_key, found);
    }
  }
}

// Launches `lookup_kernel_with_io` with a tile size chosen from the table's
// load factor: 4 threads per key up to a load factor of 0.75, 16 threads per
// key above it.
template <typename K, typename V, typename S, typename FoundFunctor>
struct SelectLookupKernelWithIOImpl {
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             V* __restrict values, S* __restrict scores,
                             const FoundFunctor& found_functor) {
    if (load_factor <= 0.75) {
      launch_tiled<4>(block_size, bucket_max_size, buckets_num, dim, stream, n,
                      table, buckets, keys, values, scores, found_functor);
    } else {
      launch_tiled<16>(block_size, bucket_max_size, buckets_num, dim, stream,
                       n, table, buckets, keys, values, scores, found_functor);
    }
    return;
  }

 private:
  // Launches the kernel with `TileSize` threads cooperating on each key; the
  // thread count handed to the kernel is therefore `n * TileSize`.
  template <unsigned int TileSize>
  static void launch_tiled(const int& block_size, const size_t bucket_max_size,
                           const size_t buckets_num, const size_t dim,
                           cudaStream_t& stream, const size_t& n,
                           const Table<K, V, S>* __restrict table,
                           Bucket<K, V, S>* buckets, const K* __restrict keys,
                           V* __restrict values, S* __restrict scores,
                           const FoundFunctor& found_functor) {
    const size_t N = n * TileSize;
    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    lookup_kernel_with_io<K, V, S, FoundFunctor, TileSize>
        <<<grid_size, block_size, 0, stream>>>(
            table, buckets, bucket_max_size, buckets_num, dim, keys, values,
            scores, found_functor, N);
  }
};

// Lookup-with-IO entry point that reports results through a `found` array:
// wraps `found` in a `FoundFunctorV1` and forwards to the shared launcher.
template <typename K, typename V, typename S>
struct SelectLookupKernelWithIO {
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             V* __restrict values, S* __restrict scores,
                             bool* __restrict found) {
    using Functor = FoundFunctorV1<K>;
    using Impl = SelectLookupKernelWithIOImpl<K, V, S, Functor>;

    Functor report(found);
    Impl::execute_kernel(load_factor, block_size, bucket_max_size, buckets_num,
                         dim, stream, n, table, buckets, keys, values, scores,
                         report);
  }
};

// Lookup-with-IO entry point that reports results through the missed-keys
// outputs: wraps `missed_keys`, `missed_indices` and `missed_size` in a
// `FoundFunctorV2` and forwards to the shared launcher.
template <typename K, typename V, typename S>
struct SelectLookupKernelWithIOV2 {
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             V* __restrict values, S* __restrict scores,
                             K* __restrict missed_keys,
                             int* __restrict missed_indices,
                             int* __restrict missed_size) {
    using Functor = FoundFunctorV2<K>;
    using Impl = SelectLookupKernelWithIOImpl<K, V, S, Functor>;

    Functor report(missed_keys, missed_indices, missed_size);
    Impl::execute_kernel(load_factor, block_size, bucket_max_size, buckets_num,
                         dim, stream, n, table, buckets, keys, values, scores,
                         report);
  }
};

// Use 1 thread to deal with a KV-pair, excluding copying value.
//
// For the key at index `kv_idx`:
//   * found : `values[kv_idx]` receives the address of the value inside the
//             bucket, `scores[kv_idx]` the score (when `scores` is not null)
//             and `found_functor(kv_idx, key, true)` is called;
//   * missed: `values[kv_idx]` is set to nullptr on every miss path and
//             `found_functor(kv_idx, key, false)` is called.
// Reserved keys are skipped: only `dst_offset[kv_idx]` is written for them.
// `dst_offset`, when not null, receives the identity mapping `kv_idx`.
//
// The probe starts at the key's start position (aligned down to the load
// width) and scans digests in chunks of `STRIDE`, wrapping around the bucket.
// The wrap uses `& (bucket_capacity - 1)`, i.e. it assumes `bucket_capacity`
// is a power of two.
template <typename K, typename V, typename S,
          typename FoundFunctor = FoundFunctorV1<K>>
__device__ void tlp_lookup_kernel_hybrid_impl(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,
    V** __restrict values, S* __restrict scores, int* __restrict dst_offset,
    FoundFunctor found_functor, uint64_t n) {
  using BUCKET = Bucket<K, V, S>;

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  if (kv_idx >= n) return;

  const K key = keys[kv_idx];
  if (dst_offset) dst_offset[kv_idx] = kv_idx;
  if (IS_RESERVED_KEY<K>(key)) return;

  const K hashed_key = Murmur3HashDevice(key);
  const VecD_Comp target_digests = digests_from_hashed<K>(hashed_key);
  uint64_t global_idx =
      static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
  // Slot inside the bucket where probing for this key starts.
  const uint32_t key_pos = get_start_position(global_idx, bucket_capacity);
  uint64_t bkt_idx = global_idx / bucket_capacity;
  BUCKET* bucket = buckets + bkt_idx;
  K* bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
  V* bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep one bit per matching byte so `__ffs` can enumerate the matches.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = bucket_keys_ptr[possible_pos];
        if (current_key == key) {
          if (scores) {
            // The score is only needed (and only loaded) for a real match.
            const S score =
                *BUCKET::scores(bucket_keys_ptr, bucket_capacity, possible_pos);
            scores[kv_idx] = score;
          }
          values[kv_idx] = bucket_values_ptr + possible_pos * dim;
          found_functor(kv_idx, key, true);
          return;
        }
      } while (true);
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first chunk, slots before the start position are ignored;
        // they are revisited by the extra final iteration.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = bucket_keys_ptr[possible_pos];
        if (current_key == static_cast<K>(EMPTY_KEY)) {
          // An empty slot ends the probe: the key is not in the bucket.
          values[kv_idx] = nullptr;
          found_functor(kv_idx, key, false);
          return;
        }
      } while (true);
    }
  }

  // The whole bucket was scanned without a match.
  values[kv_idx] = nullptr;
  found_functor(kv_idx, key, false);
}

// Kernel entry point of the one-thread-per-key lookup that reports results
// through the `founds` array (wrapped in a `FoundFunctorV1`).
template <typename K, typename V, typename S>
__global__ void tlp_lookup_kernel_hybrid(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,
    V** __restrict values, S* __restrict scores, int* __restrict dst_offset,
    bool* __restrict founds, uint64_t n) {
  using Functor = FoundFunctorV1<K>;
  Functor report(founds);
  tlp_lookup_kernel_hybrid_impl<K, V, S, Functor>(
      buckets, buckets_num, bucket_capacity, dim, keys, values, scores,
      dst_offset, report, n);
}

// Kernel entry point of the one-thread-per-key lookup that reports results
// through `missed_keys`, `missed_indices` and `missed_size` (wrapped in a
// `FoundFunctorV2`).
template <typename K, typename V, typename S>
__global__ void tlp_lookup_kernel_hybrid(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,
    V** __restrict values, S* __restrict scores, int* __restrict dst_offset,
    K* __restrict missed_keys, int* __restrict missed_indices,
    int* __restrict missed_size, uint64_t n) {
  using Functor = FoundFunctorV2<K>;
  Functor report(missed_keys, missed_indices, missed_size);
  tlp_lookup_kernel_hybrid_impl<K, V, S, Functor>(
      buckets, buckets_num, bucket_capacity, dim, keys, values, scores,
      dst_offset, report, n);
}

/* Lookup kernel body that returns value addresses instead of copying values.
 *
 * A tile of TILE_SIZE threads cooperates on one key, so `N` is the number of
 * keys multiplied by TILE_SIZE. For each non-reserved key:
 *   - `dst_offset[key_idx]` (when not null) is set to `key_idx`;
 *   - found : `values[key_idx]` points at the value inside the bucket and
 *             `scores[key_idx]` (when not null) receives the score;
 *   - missed: `values[key_idx]` is set to nullptr;
 *   - `found_functor(key_idx, key, found)` is called by lane 0.
 * Reserved keys are skipped entirely.
 */
template <class K, class V, class S, class FoundFunctor, uint32_t TILE_SIZE = 4>
__device__ void lookup_kernel_impl(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, V** __restrict values, S* __restrict scores,
    FoundFunctor found_functor, int* __restrict dst_offset, size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  const int lane = g.thread_rank();
  int* occupancy = table->buckets_size;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    // All threads of a tile share the same key index.
    int key_idx = t / TILE_SIZE;

    const K find_key = keys[key_idx];
    if (IS_RESERVED_KEY<K>(find_key)) continue;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    // For a full bucket, start probing at a tile-aligned position.
    if (occupancy[bkt_idx] >= bucket_max_size) {
      start_idx -= start_idx % TILE_SIZE;
    }

    if (lane == 0 && dst_offset != nullptr) {
      dst_offset[key_idx] = key_idx;
    }

    int key_pos = -1;
    int src_lane = -1;
    const OccupyResult status = find_without_lock<K, V, S, TILE_SIZE>(
        g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size);

    const bool found = (status == OccupyResult::DUPLICATE);
    if (!found) {
      if (lane == 0) {
        values[key_idx] = nullptr;
      }
    } else if (lane == src_lane) {
      // Only the lane that matched the key publishes the result.
      values[key_idx] = (bucket->vectors + key_pos * dim);
      if (scores != nullptr) {
        scores[key_idx] =
            bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed);
      }
    }

    if (lane == 0) {
      found_functor(key_idx, find_key, found);
    }
  }
}

// Kernel entry point for the tile-cooperative lookup, `founds` flavour:
// wraps `founds` in a FoundFunctorV1 and delegates to lookup_kernel_impl.
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__global__ void lookup_kernel(const Table<K, V, S>* __restrict table,
                              Bucket<K, V, S>* buckets,
                              const size_t bucket_max_size,
                              const size_t buckets_num, const size_t dim,
                              const K* __restrict keys, V** __restrict values,
                              S* __restrict scores, bool* __restrict founds,
                              int* __restrict dst_offset, size_t N) {
  using Reporter = FoundFunctorV1<K>;
  Reporter reporter(founds);
  lookup_kernel_impl<K, V, S, Reporter, TILE_SIZE>(
      table, buckets, bucket_max_size, buckets_num, dim, keys, values, scores,
      reporter, dst_offset, N);
}

// Kernel entry point for the tile-cooperative lookup, missed-keys flavour:
// wraps `missed_keys` / `missed_indices` / `missed_size` in a FoundFunctorV2
// and delegates to lookup_kernel_impl.
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__global__ void lookup_kernel(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, V** __restrict values, S* __restrict scores,
    K* __restrict missed_keys, int* __restrict missed_indices,
    int* __restrict missed_size, int* __restrict dst_offset, size_t N) {
  using Reporter = FoundFunctorV2<K>;
  Reporter reporter(missed_keys, missed_indices, missed_size);
  lookup_kernel_impl<K, V, S, Reporter, TILE_SIZE>(
      table, buckets, bucket_max_size, buckets_num, dim, keys, values, scores,
      reporter, dst_offset, N);
}

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/core_kernels/lookup_ptr.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

// Thread-level pointer lookup: one thread handles one key and writes back a
// pointer to the value stored inside the bucket (no value data is copied).
//
// The bucket is scanned `STRIDE` digests at a time, starting from the key's
// aligned start position. A slot whose digest matches is then confirmed by
// comparing the full key.
//  - update_score == true : the slot is locked by CAS-ing its key to
//    LOCKED_KEY, the score is refreshed through
//    ScoreFunctor::update_with_digest, and the key is stored back.
//  - update_score == false: the slot's key and score are only read.
// The scan stops as a miss when an EMPTY_KEY slot is met, or after the whole
// bucket (plus one extra stride) has been visited.
//
// Outputs for key index `kv_idx`:
//  - founds[kv_idx] (only if `founds` is non-null): true on a hit.
//  - values[kv_idx]: pointer to the stored value on a hit, nullptr otherwise
//    (including reserved keys).
//  - scores[kv_idx] (only on a hit and if `scores` is non-null): the score
//    read from the bucket.
//
// NOTE(review): the probe position wraps with `& (bucket_capacity - 1)`, which
// assumes `bucket_capacity` is a power of two -- confirm against callers.
template <typename K, typename V, typename S, int Strategy>
__global__ void tlp_lookup_ptr_kernel_with_filter(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,
    V** __restrict values, S* __restrict scores, bool* __restrict founds,
    uint64_t n, bool update_score, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;
  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  V* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  uint32_t key_pos = {0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    if (update_score) {
      score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    }
    if (!IS_RESERVED_KEY<K>(key)) {
      // Derive the digest pattern, the bucket and the start slot from the
      // hashed key.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);
    } else {
      // Reserved keys are reported as not found.
      occupy_result = OccupyResult::ILLEGAL;
      goto WRITE_BACK;
    }
  } else {
    // Thread is beyond the end of the key array.
    return;
  }

  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    // Fetch one vector of digests and split it into four comparable words.
    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep a single flag bit per byte so each match can be popped in turn.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        // Clear the lowest set flag: this candidate is being handled now.
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        if (update_score) {
          auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
          K expected_key = key;
          // Acquire on success: accesses to the bucket below are not
          // reordered before this CAS, which swaps the key for LOCKED_KEY.
          bool result = current_key->compare_exchange_strong(
              expected_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
          if (result) {
            occupy_result = OccupyResult::DUPLICATE;
            key_pos = possible_pos;
            ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                             kv_idx, score, bucket_capacity,
                                             get_digest<K>(key), false);
            // Release: restore the real key after the score update.
            current_key->store(key, cuda::std::memory_order_release);
            // Report the score now stored in the bucket.
            score = *BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
            goto WRITE_BACK;
          }
        } else {
          // Read-only path: plain loads of key and score, no locking.
          auto current_key = bucket_keys_ptr[possible_pos];
          score =
              *BUCKET::scores(bucket_keys_ptr, bucket_capacity, possible_pos);
          if (current_key == key) {
            key_pos = possible_pos;
            occupy_result = OccupyResult::DUPLICATE;
            goto WRITE_BACK;
          }
        }
      } while (true);
      // No key match in this word: look for an empty slot, which ends the
      // search as a miss.
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first stride, ignore slots located before the key's start
        // position.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = bucket_keys_ptr[possible_pos];
        if (current_key == static_cast<K>(EMPTY_KEY)) {
          occupy_result = OccupyResult::OCCUPIED_EMPTY;
          goto WRITE_BACK;
        }
      } while (true);
    }
  }

WRITE_BACK:
  // Only DUPLICATE counts as a hit; every other outcome yields nullptr.
  bool found_ = occupy_result == OccupyResult::DUPLICATE;
  if (founds) {
    founds[kv_idx] = found_;
  }
  if (found_) {
    if (scores) {
      scores[kv_idx] = score;
    }
    values[kv_idx] = bucket_values_ptr + key_pos * dim;
  } else {
    values[kv_idx] = nullptr;
  }
}

// Pipelined pointer-return lookup: reuses the cooperative 32-thread digest scan
// from lookup_kernel_with_io_pipeline_v1 (the value-copy find kernel) but skips
// the value copy stages entirely, writing only V* pointers.  This is intended
// to keep find* throughput at lambda=1.0 at least as high as find throughput.
//
// Work layout: each group of GROUP_SIZE (32) threads owns up to 32 consecutive
// keys and processes them one after another; for every key the 32 lanes scan
// the bucket's 128 digests together (4 digests per lane).
//
// Preconditions visible from the code:
//  - the bucket capacity is fixed at BUCKET_SIZE (128); digests are read from
//    the BUCKET_SIZE bytes preceding the keys array and scores from right
//    after BUCKET_SIZE keys;
//  - shared arrays are sized for BLOCK_SIZE (128) threads, so the kernel
//    presumably must be launched with blockDim.x == 128 -- TODO confirm at the
//    launch site;
//  - the digest is taken from bits 32..39 of the hashed key, which assumes a
//    64-bit key type.
//
// Outputs per key index: `values` (pointer on a hit, nullptr on a miss),
// `scores` (on a hit, if non-null) and `founds` (if non-null).
//
// NOTE(review): lanes read shared-memory slots filled by other lanes'
// asynchronous copies without an explicit group barrier; this appears to rely
// on warp-synchronous execution -- confirm.
template <typename K, typename V, typename S>
__global__ void lookup_ptr_kernel_with_pipeline(
    Bucket<K, V, S>* buckets, const size_t buckets_num, const int dim,
    const K* __restrict keys, V** __restrict values, S* __restrict scores,
    bool* __restrict founds, size_t n) {
  constexpr int GROUP_SIZE = 32;
  // Maximum number of digest-matching candidates buffered per key.
  constexpr int RESERVE = 16;
  constexpr int BLOCK_SIZE = 128;
  constexpr int BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  // Number of 4-byte digest words per bucket.
  constexpr int DIGEST_SPAN = BUCKET_SIZE / 4;

  __shared__ int sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr[BLOCK_SIZE];
  __shared__ V* sm_values_ptr[BLOCK_SIZE];
  // Reuse: once a key's digest has been consumed, its slot holds the candidate
  // count and later the found flag.
  int* sm_counts = sm_target_digests;
  int* sm_founds = sm_counts;
  // Double buffer
  __shared__ uint32_t sm_probing_digests[2][GROUP_NUM * DIGEST_SPAN];
  __shared__ K sm_possible_keys[2][GROUP_NUM * RESERVE];
  __shared__ int sm_possible_pos[2][GROUP_NUM * RESERVE];

  // Initialization
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  int key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;
  if (key_idx_base >= n) return;
  // Number of keys this group actually owns (the last group may be partial).
  int loop_num =
      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    K target_key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = target_key;
    const K hashed_key = Murmur3HashDevice(target_key);
    const uint8_t target_digest = static_cast<uint8_t>(hashed_key >> 32);
    sm_target_digests[idx_block] = static_cast<uint32_t>(target_digest);
    // Keep the slot/bucket index in 64 bits: `buckets_num * BUCKET_SIZE` can
    // exceed INT_MAX for large tables, and a truncated (possibly negative)
    // index would address memory outside `buckets`.
    const uint64_t global_idx =
        static_cast<uint64_t>(hashed_key % (buckets_num * BUCKET_SIZE));
    const uint64_t bkt_idx = global_idx / BUCKET_SIZE;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;
    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),
                            sizeof(K*));
    __pipeline_commit();
    // This copy is committed together with the first digest prefetch below.
    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),
                            sizeof(V*));
  }
  __pipeline_wait_prior(0);

  // Pipeline loading: prefetch digests for the first key
  uint8_t* digests_ptr =
      reinterpret_cast<uint8_t*>(sm_keys_ptr[groupID * GROUP_SIZE]) -
      BUCKET_SIZE;
  __pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
                          digests_ptr + rank * 4, sizeof(uint32_t));
  __pipeline_commit();
  // Padding commit so the first `__pipeline_wait_prior(2)` in the loop waits
  // for exactly the digest prefetch above.
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    /* Step1: prefetch all digests in one bucket */
    if ((i + 1) < loop_num) {
      uint8_t* digests_ptr =
          reinterpret_cast<uint8_t*>(sm_keys_ptr[key_idx_block + 1]) -
          BUCKET_SIZE;
      __pipeline_memcpy_async(
          sm_probing_digests[diff_buf(i)] + groupID * DIGEST_SPAN + rank,
          digests_ptr + rank * 4, sizeof(uint32_t));
    }
    __pipeline_commit();

    /* Step2: check digests and load possible keys */
    uint32_t target_digest = sm_target_digests[key_idx_block];
    // Replicate the digest byte into all four bytes of the word.
    uint32_t target_digests = __byte_perm(target_digest, target_digest, 0x0000);
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(2);
    uint32_t probing_digests =
        sm_probing_digests[same_buf(i)][groupID * DIGEST_SPAN + rank];
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);
    // Compact the per-byte match flags into the low four bits.
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    int find_number = __popc(find_result);
    // Reserve a contiguous range of candidate slots for this lane.
    int group_base = 0;
    if (find_number > 0) {
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr[key_idx_block];
    if (gt_vote == 0) {
      // Fast path: all candidates fit in the RESERVE buffer; fetch their keys
      // asynchronously and verify them in Step3 of the next iteration.
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          sm_possible_pos[same_buf(i)][groupID * RESERVE + group_base] =
              key_pos;
          __pipeline_memcpy_async(
              sm_possible_keys[same_buf(i)] + (groupID * RESERVE + group_base),
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      // Overflow path: too many digest matches; compare keys directly and
      // record at most one confirmed candidate in slot 0.
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = rank * 4 + digest_idx;
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_pos[same_buf(i)][groupID * RESERVE] = key_pos;
            sm_possible_keys[same_buf(i)][groupID * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) {
          break;
        }
        // Keep looping while any lane still had a candidate this round.
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    /* Step3: check possible keys, write back pointer immediately */
    if (i > 0) {
      int prev_idx_block = key_idx_block - 1;
      K target_key = sm_target_keys[prev_idx_block];
      int possible_num = sm_counts[prev_idx_block];
      sm_founds[prev_idx_block] = 0;
      __pipeline_wait_prior(2);
      int key_pos = -1;
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[diff_buf(i)][groupID * RESERVE + rank];
        key_pos = sm_possible_pos[diff_buf(i)][groupID * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        sm_founds[prev_idx_block] = 1;
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        // Write pointer directly (no value copy needed).
        if (rank == 0) {
          int key_idx_grid = blockIdx.x * blockDim.x + prev_idx_block;
          values[key_idx_grid] =
              sm_values_ptr[prev_idx_block] + target_pos * dim;
          if (scores) {
            // Scores are laid out right after the BUCKET_SIZE keys.
            S* score_ptr =
                reinterpret_cast<S*>(sm_keys_ptr[prev_idx_block] + BUCKET_SIZE);
            scores[key_idx_grid] = score_ptr[target_pos];
          }
        }
      }
    }
  }  // End loop

  /* Pipeline emptying: process the last key */
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    K target_key = sm_target_keys[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    sm_founds[key_idx_block] = 0;
    __pipeline_wait_prior(0);
    int key_pos = -1;
    bool found_flag = false;
    if (rank < possible_num) {
      key_pos = sm_possible_pos[diff_buf(loop_num)][groupID * RESERVE + rank];
      K possible_key =
          sm_possible_keys[diff_buf(loop_num)][groupID * RESERVE + rank];
      if (target_key == possible_key) {
        found_flag = true;
      }
    }
    int found_vote = g.ballot(found_flag);
    if (found_vote) {
      sm_founds[key_idx_block] = 1;
      int src_lane = __ffs(found_vote) - 1;
      int target_pos = g.shfl(key_pos, src_lane);
      if (rank == 0) {
        int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
        values[key_idx_grid] = sm_values_ptr[key_idx_block] + target_pos * dim;
        if (scores) {
          S* score_ptr =
              reinterpret_cast<S*>(sm_keys_ptr[key_idx_block] + BUCKET_SIZE);
          scores[key_idx_grid] = score_ptr[target_pos];
        }
      }
    }
  }

  // Write found flags and nullptr for misses.
  if (rank < loop_num) {
    int key_idx_block = groupID * GROUP_SIZE + rank;
    int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
    bool found_ = sm_founds[key_idx_block] > 0;
    if (founds) founds[key_idx_grid] = found_;
    if (!found_) values[key_idx_grid] = nullptr;
  }
}

/* Tile-cooperative pointer lookup: a tile of TILE_SIZE threads handles one
 * key and writes back a pointer to the value stored in the bucket (no value
 * data is copied). `N` is the number of threads' worth of work; the key index
 * is `t / TILE_SIZE`.
 *
 * Outputs for key index `key_idx`:
 *  - found[key_idx] (only if `found` is non-null): true on a hit, false on a
 *    miss or for a reserved key.
 *  - values[key_idx]: pointer into the bucket on a hit, nullptr otherwise.
 *  - scores[key_idx] (only on a hit and if `scores` is non-null): the slot's
 *    score.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__global__ void lookup_ptr_kernel(const Table<K, V, S>* __restrict table,
                                  Bucket<K, V, S>* buckets,
                                  const size_t bucket_max_size,
                                  const size_t buckets_num, const size_t dim,
                                  const K* __restrict keys,
                                  V** __restrict values, S* __restrict scores,
                                  bool* __restrict found, size_t N) {
  int* buckets_size = table->buckets_size;

  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int rank = g.thread_rank();

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_idx = t / TILE_SIZE;

    const K find_key = keys[key_idx];
    OccupyResult occupy_result{OccupyResult::INITIAL};
    int key_pos = -1;
    int src_lane = -1;
    Bucket<K, V, S>* bucket{nullptr};
    if (!IS_RESERVED_KEY<K>(find_key)) {
      size_t bkt_idx = 0;
      size_t start_idx = 0;

      bucket = get_key_position<K>(buckets, find_key, bkt_idx, start_idx,
                                   buckets_num, bucket_max_size);

      // When the bucket is reported full, start probing on a tile boundary.
      const int bucket_size = buckets_size[bkt_idx];
      if (bucket_size >= bucket_max_size) {
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
      }

      occupy_result = find_without_lock<K, V, S, TILE_SIZE>(
          g, bucket, find_key, start_idx, key_pos, src_lane, bucket_max_size);
    } else {
      occupy_result = OccupyResult::ILLEGAL;
    }

    const bool found_ = occupy_result == OccupyResult::DUPLICATE;
    // On a hit the matching lane publishes the result. Otherwise `src_lane`
    // may still be -1 (it always is for reserved keys, where no probe runs),
    // which no lane matches, so lane 0 reports the miss; without this the
    // `found = false` / `nullptr` outputs would never be written.
    const int writer_lane = found_ ? src_lane : 0;
    if (rank == writer_lane) {
      if (found != nullptr) {
        *(found + key_idx) = found_;
      }
      if (found_) {
        values[key_idx] = bucket->vectors + key_pos * dim;
        if (scores != nullptr) {
          *(scores + key_idx) =
              bucket->scores(key_pos)->load(cuda::std::memory_order_relaxed);
        }
      } else {
        values[key_idx] = nullptr;
      }
    }
  }
}

// Host-side dispatcher for `lookup_ptr_kernel`: picks the cooperative tile
// width from the table load factor (4 threads per key up to 0.75, 16 above)
// and launches the kernel on `stream`.
template <typename K, typename V, typename S>
struct SelectLookupPtrKernel {
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             V** __restrict values, S* __restrict scores,
                             bool* __restrict found) {
    if (load_factor <= 0.75) {
      launch_with_tile<4>(block_size, bucket_max_size, buckets_num, dim, stream,
                          n, table, buckets, keys, values, scores, found);
    } else {
      launch_with_tile<16>(block_size, bucket_max_size, buckets_num, dim,
                           stream, n, table, buckets, keys, values, scores,
                           found);
    }
  }

 private:
  // Launches `lookup_ptr_kernel` with TILE threads cooperating on each key,
  // i.e. `n * TILE` threads' worth of work in total.
  template <unsigned int TILE>
  static void launch_with_tile(const int& block_size,
                               const size_t bucket_max_size,
                               const size_t buckets_num, const size_t dim,
                               cudaStream_t& stream, const size_t& n,
                               const Table<K, V, S>* __restrict table,
                               Bucket<K, V, S>* buckets,
                               const K* __restrict keys, V** __restrict values,
                               S* __restrict scores, bool* __restrict found) {
    const size_t N = n * TILE;
    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    lookup_ptr_kernel<K, V, S, TILE><<<grid_size, block_size, 0, stream>>>(
        table, buckets, bucket_max_size, buckets_num, dim, keys, values, scores,
        found, N);
  }
};

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/update.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

// Thread-level update: one thread handles one KV-pair, including copying the
// value into the bucket.
//
// Only keys that already exist are updated. For a key found in its bucket the
// slot is locked by CAS-ing the key to LOCKED_KEY, the score is refreshed via
// ScoreFunctor::update_without_missed, `dim` VecV elements are copied from
// `values + kv_idx * dim` into the bucket, and the key is stored back to
// unlock the slot. The thread returns without modifying anything when the key
// is reserved, when an EMPTY_KEY slot is met during the scan, or when the
// whole bucket (plus one extra stride) has been visited without a match.
//
// NOTE(review): the probe position wraps with `& (bucket_capacity - 1)`, which
// assumes `bucket_capacity` is a power of two -- confirm against callers.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void tlp_update_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,
    const VecV* __restrict__ values, const S* __restrict__ scores, uint64_t n,
    const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, 1>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  uint32_t key_pos = {0};
  if (kv_idx < n) {
    key = keys[kv_idx];

    if (!IS_RESERVED_KEY<K>(key)) {
      // Derive the digest pattern, the bucket and the start slot from the
      // hashed key.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      // Reserved keys are never updated.
      return;
    }
  } else {
    // Thread is beyond the end of the key array.
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    // Fetch one vector of digests and split it into four comparable words.
    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep a single flag bit per byte so each match can be popped in turn.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // Acquire on success: accesses to the bucket below are not reordered
        // before this CAS, which swaps the key for LOCKED_KEY.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_without_missed(bucket_keys_ptr, bucket_capacity,
                                            key_pos, scores, kv_idx,
                                            global_epoch);
        VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;
        // Widen before multiplying: `kv_idx * dim` in 32 bits wraps once the
        // batch holds 2^32 or more VecV elements and would read the wrong
        // source row.
        const VecV* param_value_ptr =
            values + static_cast<uint64_t>(kv_idx) * dim;
        CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);
        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
        // memory_order_release:
        // modifications to the bucket are not reordered after this store,
        // which restores the real key and unlocks the slot.
        key_address->store(key, cuda::std::memory_order_release);
        return;
      }
      // No key match in this word: an empty slot means the key is absent.
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first stride, ignore slots located before the key's start
        // position.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);
        if (probe_key == static_cast<K>(EMPTY_KEY)) {
          return;
        }
      } while (true);
    }
  }
}
/*
 * Update the values and scores of keys that are already stored in the table,
 * using a software pipeline inside each tile of GROUP_SIZE threads.
 *
 * Every tile handles up to GROUP_SIZE consecutive keys. In loop iteration `i`
 * the tile runs four steps on different keys at once:
 *   Step1: prefetch the digests of the bucket of key `i + 1`;
 *   Step2: compare the digests of key `i` and fetch the candidate keys;
 *   Step3: for key `i - 1`, compare the candidates, lock the matching slot
 *          (CAS to LOCKED_KEY) and stage its new value/score in shared memory;
 *   Step4: for key `i - 2`, write value and score to the bucket and restore
 *          the key, which unlocks the slot.
 * Three drain blocks after the loop finish the keys still in flight.
 * Keys without a matching slot are left untouched.
 *
 * Launch requirements visible in this code:
 *   - the bucket capacity is fixed to 128 (BUCKET_SIZE);
 *   - `dim` is the value length counted in VecV elements;
 *   - dynamic shared memory must hold GROUP_NUM * 2 * dim VecV elements
 *     (double-buffered value staging, see `sm_values_buffer`);
 *   - `scores` may be nullptr, in which case no score is staged.
 *
 * NOTE(review): lanes exchange data through shared memory (`sm_keys_ptr`,
 * `sm_counts`, `sm_possible_*`, ...) without an explicit `g.sync()`; this
 * appears to rely on the tile executing convergently - confirm.
 */
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,
          uint32_t GROUP_SIZE = 16, int Strategy = -1>
__global__ void pipeline_update_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    const uint32_t dim, const K* __restrict__ keys,
    const VecV* __restrict__ values, const S* __restrict__ scores, uint64_t n,
    const S global_epoch) {
  constexpr uint32_t BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);
  // Here, GROUP_SIZE * Load_LEN = BUCKET_SIZE.
  using VecD_Load = byte8;
  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);
  // Maximum number of candidate keys buffered per target key.
  constexpr int RESERVE = 8;

  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using CopyScore = CopyScoreByPassCache<S, K, BUCKET_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  __shared__ VecD_Comp sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr[BLOCK_SIZE];
  __shared__ VecV* sm_values_ptr[BLOCK_SIZE];
  // Reuse: once the target digests of a key were read in step 2, the same
  // slot stores the candidate count and later the matched position.
  int* sm_counts = reinterpret_cast<int*>(sm_target_digests);
  int* sm_position = sm_counts;
  // Double buffer
  __shared__ D sm_digests[GROUP_NUM][2 * BUCKET_SIZE];
  __shared__ K sm_possible_keys[GROUP_NUM][2 * RESERVE];
  __shared__ int sm_possible_pos[GROUP_NUM][2 * RESERVE];
  __shared__ S sm_scores[GROUP_NUM][2];
  __shared__ int sm_ranks[GROUP_NUM][2];
  // __shared__ VecV sm_values_buffer[GROUP_NUM][2 * dim];

  extern __shared__ __align__(alignof(byte16)) byte sm_values_buffer[];

  // Result of the locking CAS, one entry per pipeline buffer. Only the lane
  // that issued the CAS holds a meaningful value; it is broadcast in step 4.
  bool CAS_res[2]{false};

  // Initialization
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  // Computed in 64 bits so that large grids do not wrap the key index.
  uint64_t key_idx_base =
      static_cast<uint64_t>(blockIdx.x) * blockDim.x + groupID * GROUP_SIZE;
  if (key_idx_base >= n) return;
  int loop_num =
      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;
  if (rank < loop_num) {
    // Each lane prepares one key: digest, key and the bucket's pointers.
    int idx_block = groupID * GROUP_SIZE + rank;
    K key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = key;
    const K hashed_key = Murmur3HashDevice(key);
    sm_target_digests[idx_block] = digests_from_hashed<K>(hashed_key);
    uint64_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);
    uint64_t bkt_idx = global_idx / BUCKET_SIZE;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;
    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),
                            sizeof(K*));
    __pipeline_commit();
    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),
                            sizeof(VecV*));
  }
  __pipeline_wait_prior(0);

  // Pipeline loading
  K* keys_ptr = sm_keys_ptr[groupID * GROUP_SIZE];
  D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);
  __pipeline_memcpy_async(sm_digests[groupID] + rank * Load_LEN, digests_ptr,
                          sizeof(VecD_Load));
  __pipeline_commit();
  // Padding, meet the param of the first `__pipeline_wait_prior`
  // in the first loop.
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    /* Step1: prefetch all digests in one bucket */
    if ((i + 1) < loop_num) {
      K* keys_ptr = sm_keys_ptr[key_idx_block + 1];
      D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);
      __pipeline_memcpy_async(
          sm_digests[groupID] + diff_buf(i) * BUCKET_SIZE + rank * Load_LEN,
          digests_ptr, sizeof(VecD_Load));
    }
    __pipeline_commit();

    /* Step2: check digests and load possible keys */
    VecD_Comp target_digests = sm_target_digests[key_idx_block];
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(3);
    // Each lane compares two words of digests: bits 0..3 of `find_result`
    // flag matches in the first word, bits 4..7 matches in the second one.
    VecD_Comp probing_digests = *reinterpret_cast<VecD_Comp*>(
        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE + rank * Comp_LEN]);
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    probing_digests = *reinterpret_cast<VecD_Comp*>(
        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE +
                             (GROUP_SIZE + rank) * Comp_LEN]);
    find_result_ = __vcmpeq4(probing_digests, target_digests);
    if ((find_result_ & 0x01) != 0) find_result |= 0x10;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x20;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x40;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x80;
    int find_number = __popc(find_result);
    int group_base = 0;
    if (find_number > 0) {
      // Reserve a contiguous range of candidate slots for this lane.
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr[key_idx_block];
    if (gt_vote == 0) {
      // All candidates fit in the reserve: fetch them asynchronously and
      // compare them in step 3 of the next iteration.
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          // For digest_idx >= 4 this equals
          // (GROUP_SIZE + rank) * 4 + (digest_idx - 4).
          int key_pos = digest_idx < 4
                            ? (rank * 4 + digest_idx)
                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);
          sm_possible_pos[groupID][same_buf(i) * RESERVE + group_base] =
              key_pos;
          __pipeline_memcpy_async(
              sm_possible_keys[groupID] + same_buf(i) * RESERVE + group_base,
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      // Too many candidates: compare the keys right away and keep at most
      // the single matching one in slot 0 of the candidate buffer.
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = digest_idx < 4
                            ? (rank * 4 + digest_idx)
                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_pos[groupID][same_buf(i) * RESERVE] = key_pos;
            sm_possible_keys[groupID][same_buf(i) * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) {
          break;
        }
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    /* Step3: check possible keys, and prefecth the value and score */
    if (i > 0) {
      key_idx_block -= 1;
      K target_key = sm_target_keys[key_idx_block];
      K* keys_ptr = sm_keys_ptr[key_idx_block];
      int possible_num = sm_counts[key_idx_block];
      sm_position[key_idx_block] = -1;
      __pipeline_wait_prior(3);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[groupID][diff_buf(i) * RESERVE + rank];
        key_pos = sm_possible_pos[groupID][diff_buf(i) * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
          if (scores) {
            const uint64_t key_idx_grid =
                static_cast<uint64_t>(blockIdx.x) * blockDim.x + key_idx_block;
            const S* score_ptr = scores + key_idx_grid;
            CopyScore::ldg_sts(sm_scores[groupID] + diff_buf(i), score_ptr);
          }
          auto key_ptr = BUCKET::keys(keys_ptr, key_pos);
          sm_ranks[groupID][diff_buf(i)] = rank;
          // Lock the slot; the outcome is consumed in step 4.
          if (diff_buf(i) == 0) {
            CAS_res[0] = key_ptr->compare_exchange_strong(
                possible_key, static_cast<K>(LOCKED_KEY),
                cuda::std::memory_order_acquire,
                cuda::std::memory_order_relaxed);
          } else {
            CAS_res[1] = key_ptr->compare_exchange_strong(
                possible_key, static_cast<K>(LOCKED_KEY),
                cuda::std::memory_order_acquire,
                cuda::std::memory_order_relaxed);
          }
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        sm_position[key_idx_block] = target_pos;
        // 64-bit index: `key_idx_grid * dim` must not wrap for large batches.
        const uint64_t key_idx_grid =
            static_cast<uint64_t>(blockIdx.x) * blockDim.x + key_idx_block;
        const VecV* v_src = values + key_idx_grid * dim;
        auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
        VecV* v_dst = tmp + (groupID * 2 + diff_buf(i)) * dim;
        CopyValue::ldg_sts(rank, v_dst, v_src, dim);
      }
    }
    __pipeline_commit();

    /* Step4: write back value and score */
    if (i > 1) {
      key_idx_block -= 1;
      VecV* value_ptr = sm_values_ptr[key_idx_block];
      int target_pos = sm_position[key_idx_block];
      K target_key = sm_target_keys[key_idx_block];
      K* keys_ptr = sm_keys_ptr[key_idx_block];
      int src_lane = sm_ranks[groupID][same_buf(i)];
      __pipeline_wait_prior(3);
      int succ = 0;
      if (rank == src_lane) {
        bool CAS_res_cur = same_buf(i) == 0 ? CAS_res[0] : CAS_res[1];
        succ = CAS_res_cur ? 1 : 0;
      }
      // Broadcast whether the locking CAS of step 3 succeeded.
      succ = g.shfl(succ, src_lane);
      if (target_pos >= 0 && succ == 1) {
        auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
        VecV* v_src = tmp + (groupID * 2 + same_buf(i)) * dim;
        VecV* v_dst = value_ptr + target_pos * dim;
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        ScoreFunctor::update_without_missed(keys_ptr, BUCKET_SIZE, target_pos,
                                            sm_scores[groupID] + same_buf(i), 0,
                                            global_epoch);
        if (rank == 0) {
          // Restoring the key releases the slot.
          auto key_address = BUCKET::keys(keys_ptr, target_pos);
          key_address->store(target_key, cuda::std::memory_order_release);
        }
      }
    }
  }  // End loop

  /* Pipeline emptying: step3, i = loop_num */
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    sm_position[key_idx_block] = -1;
    __pipeline_wait_prior(1);
    int key_pos;
    bool found_flag = false;
    if (rank < possible_num) {
      K possible_key =
          sm_possible_keys[groupID][diff_buf(loop_num) * RESERVE + rank];
      key_pos = sm_possible_pos[groupID][diff_buf(loop_num) * RESERVE + rank];
      if (possible_key == target_key) {
        found_flag = true;
        if (scores) {
          const uint64_t key_idx_grid =
              static_cast<uint64_t>(blockIdx.x) * blockDim.x + key_idx_block;
          const S* score_ptr = scores + key_idx_grid;
          CopyScore::ldg_sts(sm_scores[groupID] + diff_buf(loop_num),
                             score_ptr);
        }
        auto key_ptr = BUCKET::keys(keys_ptr, key_pos);
        sm_ranks[groupID][diff_buf(loop_num)] = rank;
        if (diff_buf(loop_num) == 0) {
          CAS_res[0] = key_ptr->compare_exchange_strong(
              possible_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
        } else {
          CAS_res[1] = key_ptr->compare_exchange_strong(
              possible_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
        }
      }
    }
    int found_vote = g.ballot(found_flag);
    if (found_vote) {
      int src_lane = __ffs(found_vote) - 1;
      int target_pos = g.shfl(key_pos, src_lane);
      sm_position[key_idx_block] = target_pos;
      // 64-bit index: `key_idx_grid * dim` must not wrap for large batches.
      const uint64_t key_idx_grid =
          static_cast<uint64_t>(blockIdx.x) * blockDim.x + key_idx_block;
      const VecV* v_src = values + key_idx_grid * dim;
      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
      VecV* v_dst = tmp + (groupID * 2 + diff_buf(loop_num)) * dim;
      CopyValue::ldg_sts(rank, v_dst, v_src, dim);
    }
  }
  __pipeline_commit();

  /* Pipeline emptying: step4, i = loop_num */
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    VecV* value_ptr = sm_values_ptr[key_idx_block];
    int target_pos = sm_position[key_idx_block];
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int src_lane = sm_ranks[groupID][same_buf(loop_num)];
    __pipeline_wait_prior(1);
    int succ = 0;
    if (rank == src_lane) {
      bool CAS_res_cur = same_buf(loop_num) == 0 ? CAS_res[0] : CAS_res[1];
      succ = CAS_res_cur ? 1 : 0;
    }
    succ = g.shfl(succ, src_lane);
    if (target_pos >= 0 && succ == 1) {
      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
      VecV* v_src = tmp + (groupID * 2 + same_buf(loop_num)) * dim;
      VecV* v_dst = value_ptr + target_pos * dim;
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      ScoreFunctor::update_without_missed(
          keys_ptr, BUCKET_SIZE, target_pos,
          sm_scores[groupID] + same_buf(loop_num), 0, global_epoch);

      if (rank == 0) {
        auto key_address = BUCKET::keys(keys_ptr, target_pos);
        key_address->store(target_key, cuda::std::memory_order_release);
      }
    }
  }

  /* Pipeline emptying: step4, i = loop_num + 1 */
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    VecV* value_ptr = sm_values_ptr[key_idx_block];
    int target_pos = sm_position[key_idx_block];
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int src_lane = sm_ranks[groupID][same_buf(loop_num + 1)];
    __pipeline_wait_prior(0);
    int succ = 0;
    if (rank == src_lane) {
      bool CAS_res_cur = same_buf(loop_num + 1) == 0 ? CAS_res[0] : CAS_res[1];
      succ = CAS_res_cur ? 1 : 0;
    }
    succ = g.shfl(succ, src_lane);
    if (target_pos >= 0 && succ == 1) {
      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
      VecV* v_src = tmp + (groupID * 2 + same_buf(loop_num + 1)) * dim;
      VecV* v_dst = value_ptr + target_pos * dim;
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      ScoreFunctor::update_without_missed(
          keys_ptr, BUCKET_SIZE, target_pos,
          sm_scores[groupID] + same_buf(loop_num + 1), 0, global_epoch);
      if (rank == 0) {
        auto key_address = BUCKET::keys(keys_ptr, target_pos);
        key_address->store(target_key, cuda::std::memory_order_release);
      }
    }
  }
}  // End function

// Argument bundle handed to the update kernel launchers
// (`Launch_TLP_Update`, `Launch_Pipeline_Update`) and to
// `KernelSelector_Update`.
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct Params_Update {
  // Load factor of the table; consulted when choosing between kernels.
  float load_factor;
  // Bucket array of the table and the number of buckets in it.
  Bucket<K, V, S>* __restrict__ buckets;
  size_t buckets_num;
  // Number of slots per bucket.
  uint32_t bucket_capacity;
  // Value length. Starts out counted in `V` elements; the launchers rescale
  // it in place to the vector type they pick.
  uint32_t dim;
  // Input keys, values and scores of the batch.
  const K* __restrict__ keys;
  const V* __restrict__ values;
  const S* __restrict__ scores;
  // Number of key-value pairs in the batch.
  uint64_t n;
  // Epoch forwarded unchanged to the kernels.
  const S global_epoch;

  // Stores every argument in the member of the same name.
  Params_Update(float load_factor_in, Bucket<K, V, S>* __restrict__ buckets_in,
                size_t buckets_num_in, uint32_t bucket_capacity_in,
                uint32_t dim_in, const K* __restrict__ keys_in,
                const V* __restrict__ values_in,
                const S* __restrict__ scores_in, size_t n_in,
                const S global_epoch_in)
      : load_factor(load_factor_in),
        buckets(buckets_in),
        buckets_num(buckets_num_in),
        bucket_capacity(bucket_capacity_in),
        dim(dim_in),
        keys(keys_in),
        values(values_in),
        scores(scores_in),
        n(n_in),
        global_epoch(global_epoch_in) {}
};

// Launcher for `tlp_update_kernel_with_io`: one thread per key-value pair,
// values moved as `VecV` elements.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_TLP_Update {
  using Params = Params_Update<K, V, S>;

  // Launches the kernel on `stream` with 128-thread blocks covering
  // `params.n` keys. Note: `params.dim` is rescaled in place from `V`
  // elements to `VecV` elements.
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;

    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // Ceil-division: enough blocks for every key.
    const auto grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const VecV* vec_values = reinterpret_cast<const VecV*>(params.values);

    tlp_update_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>
        <<<grid_size, BLOCK_SIZE, 0, stream>>>(
            params.buckets, params.buckets_num, params.bucket_capacity,
            params.dim, params.keys, vec_values, params.scores, params.n,
            params.global_epoch);
  }
};

// Launcher for `pipeline_update_kernel_with_io`: tiles of 16 threads process
// keys in a software pipeline, values moved as `VecV` elements.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_Pipeline_Update {
  using Params = Params_Update<K, V, S>;

  // Launches the kernel on `stream` with 128-thread blocks and enough dynamic
  // shared memory for two value buffers per tile. Note: `params.dim` is
  // rescaled in place from `V` elements to `VecV` elements.
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    constexpr uint32_t GROUP_SIZE = 16;
    constexpr uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;

    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // Two value buffers per tile, rounded up to a multiple of sizeof(byte16).
    const uint32_t raw_bytes = GROUP_NUM * 2 * params.dim * sizeof(VecV);
    const uint32_t shared_mem =
        (raw_bytes + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);

    const auto grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const VecV* vec_values = reinterpret_cast<const VecV*>(params.values);

    pipeline_update_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, GROUP_SIZE,
                                   Strategy>
        <<<grid_size, BLOCK_SIZE, shared_mem, stream>>>(
            params.buckets, params.buckets_num, params.dim, params.keys,
            vec_values, params.scores, params.n, params.global_epoch);
  }
};

// Per-architecture value-size thresholds used when selecting an update
// kernel. Only declared here; each supported `ArchTag` provides a
// specialization with `size_tlp` and `size_pipeline`.
template <typename ArchTag>
struct ValueConfig_Update;

/// TODO: support more arch.
template <>
struct ValueConfig_Update<Sm80> {
  // Upper bound (bytes) of the value size handled well by the TLP kernel;
  // larger values perform poorly there.
  static constexpr uint32_t size_tlp = sizeof(byte4) * 8;
  // Upper bound (bytes) of the value size for the Pipeline kernel. Larger
  // values reduce occupancy, and very large ones make the launch fail.
  static constexpr uint32_t size_pipeline = sizeof(byte4) * 128;
};

template <>
struct ValueConfig_Update<Sm70> {
  // Upper bound (bytes) of the value size handled well by the TLP kernel;
  // larger values perform poorly there.
  static constexpr uint32_t size_tlp = sizeof(byte4) * 8;
  // Upper bound (bytes) of the value size for the Pipeline kernel. Larger
  // values reduce occupancy, and very large ones make the launch fail.
  static constexpr uint32_t size_pipeline = sizeof(byte4) * 64;
};

// Decides whether the TLP / Pipeline update kernels can serve a request and,
// if so, launches the one expected to perform best.
template <typename K, typename V, typename S, int Strategy, typename ArchTag>
struct KernelSelector_Update {
  using ValueConfig = ValueConfig_Update<ArchTag>;
  using Params = Params_Update<K, V, S>;

 private:
  // Launches `Launcher` instantiated with the widest vector type (byte16,
  // byte8, byte4, byte2, byte) whose size divides `total_value_size`.
  template <template <typename, typename, typename, typename, int>
            class Launcher>
  static void launch_with_widest_vector(Params& params, cudaStream_t& stream,
                                        const uint32_t total_value_size) {
    if (total_value_size % sizeof(byte16) == 0) {
      Launcher<K, V, S, byte16, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte8) == 0) {
      Launcher<K, V, S, byte8, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte4) == 0) {
      Launcher<K, V, S, byte4, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte2) == 0) {
      Launcher<K, V, S, byte2, Strategy>::launch_kernel(params, stream);
    } else {
      Launcher<K, V, S, byte, Strategy>::launch_kernel(params, stream);
    }
  }

 public:
  // Returns true when `select_kernel` may be used: keys must be unique, the
  // bucket must hold at least one digest load, and the value must either fit
  // the TLP limit or (for 128-slot buckets) the Pipeline limit.
  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {
    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);
    const bool eligible = unique_key && bucket_size >= MinBucketCap;
    if (!eligible) return false;

    const uint32_t value_size = dim * sizeof(V);
    const bool fits_tlp = value_size <= ValueConfig::size_tlp;
    const bool fits_pipeline =
        bucket_size == 128 && value_size <= ValueConfig::size_pipeline;
    return fits_tlp || fits_pipeline;
  }

  // Picks and launches the kernel. The value size is sampled before the
  // launcher rescales `params.dim`.
  static void select_kernel(Params& params, cudaStream_t& stream) {
    const uint32_t total_value_size =
        static_cast<uint32_t>(params.dim * sizeof(V));

    // This part is according to the test on A100.
    // TLP serves every bucket capacity other than 128; for 128-slot buckets
    // it is only chosen for small values at a load factor of at most 0.60.
    bool use_tlp;
    if (params.bucket_capacity != 128) {
      use_tlp = true;
    } else if (total_value_size > ValueConfig::size_tlp) {
      use_tlp = false;
    } else {
      use_tlp = params.load_factor <= 0.60f;
    }

    if (use_tlp) {
      launch_with_widest_vector<Launch_TLP_Update>(params, stream,
                                                   total_value_size);
    } else {
      launch_with_widest_vector<Launch_Pipeline_Update>(params, stream,
                                                        total_value_size);
    }
  }  // End function
};

/*
 * Update with IO operation: for every key found in the table, the new value
 * is copied into the bucket by the kernel itself and the score is refreshed.
 * One tile of TILE_SIZE threads cooperates on each key; `N` is the number of
 * keys multiplied by TILE_SIZE. This kernel is usually used for the pure HBM
 * mode for better performance.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void update_kernel_with_io(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, const V* __restrict values,
    const S* __restrict scores, const S global_epoch, const size_t N) {
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  const size_t stride = blockDim.x * gridDim.x;
  size_t t = (blockIdx.x * blockDim.x) + threadIdx.x;
  for (; t < N; t += stride) {
    // All threads of a tile share the same key.
    const size_t key_idx = t / TILE_SIZE;
    const K update_key = keys[key_idx];
    if (IS_RESERVED_KEY<K>(update_key)) continue;

    // Outputs of the helpers below.
    int key_pos = -1;
    int src_lane = -1;
    size_t bkt_idx = 0;
    size_t start_idx = 0;

    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    const int bucket_size = buckets_size[bkt_idx];
    if (bucket_size >= bucket_max_size) {
      // Full bucket: round the start position down to a tile boundary.
      start_idx -= start_idx % TILE_SIZE;
    }

    OccupyResult occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(
        g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size);
    // Every lane adopts the result reported by `src_lane`.
    occupy_result = g.shfl(occupy_result, src_lane);

    if (occupy_result == OccupyResult::REFUSED) continue;

    const bool is_src_lane = g.thread_rank() == src_lane;

    if (occupy_result == OccupyResult::DUPLICATE) {
      // The whole tile copies the value; one lane updates the score.
      copy_vector<V, TILE_SIZE>(g, values + key_idx * dim,
                                bucket->vectors + key_pos * dim, dim);
      if (is_src_lane) {
        ScoreFunctor::update_without_missed(bucket, key_pos, scores, key_idx,
                                            global_epoch);
      }
    }

    if (is_src_lane) {
      // Write the key back into its slot.
      (bucket->keys(key_pos))
          ->store(update_key, cuda::std::memory_order_relaxed);
    }
  }
}

// Host-side dispatcher for `update_kernel_with_io`: picks the tile size from
// the load factor and launches the kernel.
template <typename K, typename V, typename S, int Strategy>
struct SelectUpdateKernelWithIO {
 private:
  // Launches `update_kernel_with_io` with `TileSize` threads per key.
  template <uint32_t TileSize>
  static void launch_with_tile(const int& block_size,
                               const size_t bucket_max_size,
                               const size_t buckets_num, const size_t dim,
                               cudaStream_t& stream, const size_t& n,
                               const Table<K, V, S>* __restrict table,
                               Bucket<K, V, S>* buckets,
                               const K* __restrict keys,
                               const V* __restrict values,
                               const S* __restrict scores,
                               const S global_epoch) {
    // One tile per key, so the kernel iterates over n * TileSize threads.
    const size_t N = n * TileSize;
    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    update_kernel_with_io<K, V, S, Strategy, TileSize>
        <<<grid_size, block_size, 0, stream>>>(
            table, buckets, bucket_max_size, buckets_num, dim, keys, values,
            scores, global_epoch, N);
  }

 public:
  // Uses 4-thread tiles while the load factor is at most 0.75 and 32-thread
  // tiles above that.
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             const V* __restrict values,
                             const S* __restrict scores, const S global_epoch) {
    if (load_factor <= 0.75) {
      launch_with_tile<4>(block_size, bucket_max_size, buckets_num, dim, stream,
                          n, table, buckets, keys, values, scores,
                          global_epoch);
    } else {
      launch_with_tile<32>(block_size, bucket_max_size, buckets_num, dim,
                           stream, n, table, buckets, keys, values, scores,
                           global_epoch);
    }
  }
};

// Use 1 thread to deal with a KV-pair. The value itself is NOT copied here:
// when the key is found, its slot is locked (key replaced by LOCKED_KEY), the
// score is updated, and the addresses of the value slot and of the key slot
// are written to `values[kv_idx]` and `key_ptrs[kv_idx]`.
// The slot is left locked by this kernel; presumably a later step copies the
// value and restores the key through `key_ptrs` - confirm against the caller.
// For a reserved key `key_ptrs[kv_idx]` is set to nullptr; for a key that is
// not found neither output is written.
// `src_offset`, when not null, receives the identity mapping kv_idx -> kv_idx.
template <typename K, typename V, typename S, int Strategy = -1>
__global__ void tlp_update_kernel_hybrid(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,
    V** __restrict__ values, const S* __restrict__ scores,
    K** __restrict__ key_ptrs, int* __restrict src_offset, const S global_epoch,
    uint64_t n) {
  using BUCKET = Bucket<K, V, S>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  V* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  uint32_t key_pos = {0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    if (src_offset) src_offset[kv_idx] = kv_idx;
    if (!IS_RESERVED_KEY<K>(key)) {
      // Locate the bucket and the probing start position from the hash.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = bucket->vectors;
    } else {
      // Reserved keys are never looked up.
      key_ptrs[kv_idx] = nullptr;
      return;
    }
  } else {
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    // `occupy_result` is never modified in this kernel, so this check does
    // not trigger; the loop ends through the `return`s below or its bound.
    if (occupy_result != OccupyResult::INITIAL) break;

    // Probe position, wrapped with a mask (bucket_capacity - 1).
    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep one bit per matching byte so that `__ffs` can enumerate them.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // memory_order_acquire: later accesses to the bucket are not
        // reordered before this instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        // Key found and locked: update the score and publish the addresses.
        key_pos = possible_pos;
        ScoreFunctor::update_without_missed(bucket_keys_ptr, bucket_capacity,
                                            key_pos, scores, kv_idx,
                                            global_epoch);
        V* bucket_value_ptr = bucket_values_ptr + key_pos * dim;
        values[kv_idx] = bucket_value_ptr;
        key_ptrs[kv_idx] = bucket_keys_ptr + key_pos;
        return;
      }
      // No match in this word: an empty slot ends the search (key absent).
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first round, skip slots located before the start position.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);
        if (probe_key == static_cast<K>(EMPTY_KEY)) {
          return;
        }
      } while (true);
    }
  }
}

/*
 * Update without IO operation: for every key found in the table the score is
 * refreshed and the address of its value slot is written to
 * `vectors[key_idx]` (nullptr when the lookup reports anything other than
 * DUPLICATE or REFUSED). `src_offset[key_idx]` receives `key_idx`.
 * One tile of TILE_SIZE threads cooperates on each key; `N` is the number of
 * keys multiplied by TILE_SIZE.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void update_kernel(const Table<K, V, S>* __restrict table,
                              Bucket<K, V, S>* buckets,
                              const size_t bucket_max_size,
                              const size_t buckets_num, const size_t dim,
                              const K* __restrict keys, V** __restrict vectors,
                              const S* __restrict scores,
                              int* __restrict src_offset, const S global_epoch,
                              size_t N) {
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  const size_t stride = blockDim.x * gridDim.x;
  size_t t = (blockIdx.x * blockDim.x) + threadIdx.x;
  for (; t < N; t += stride) {
    // All threads of a tile share the same key.
    const size_t key_idx = t / TILE_SIZE;
    const K update_key = keys[key_idx];
    if (IS_RESERVED_KEY<K>(update_key)) continue;

    // Outputs of the helpers below.
    int key_pos = -1;
    int src_lane = -1;
    size_t bkt_idx = 0;
    size_t start_idx = 0;

    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    const int bucket_size = buckets_size[bkt_idx];
    src_offset[key_idx] = key_idx;

    if (bucket_size >= bucket_max_size) {
      // Full bucket: round the start position down to a tile boundary.
      start_idx -= start_idx % TILE_SIZE;
    }

    OccupyResult occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(
        g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size);
    // Every lane adopts the result reported by `src_lane`.
    occupy_result = g.shfl(occupy_result, src_lane);

    if (occupy_result == OccupyResult::REFUSED) continue;

    if (g.thread_rank() == src_lane) {
      if (occupy_result == OccupyResult::DUPLICATE) {
        vectors[key_idx] = bucket->vectors + key_pos * dim;
        ScoreFunctor::update_without_missed(bucket, key_pos, scores, key_idx,
                                            global_epoch);
      } else {
        vectors[key_idx] = nullptr;
      }
      // Write the key back into its slot.
      (bucket->keys(key_pos))
          ->store(update_key, cuda::std::memory_order_relaxed);
    }
  }
}

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/update_score.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

// Use 1 thread to deal with a KV-pair, including copying value.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void tlp_update_score_kernel(Bucket<K, V, S>* __restrict__ buckets,
                                        const uint64_t buckets_num,
                                        uint32_t bucket_capacity,
                                        const K* __restrict__ keys,
                                        const S* __restrict__ scores,
                                        uint64_t n, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  K* bucket_keys_ptr{nullptr};
  uint32_t key_pos = {0};
  if (kv_idx < n) {
    key = keys[kv_idx];

    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = global_idx & (bucket_capacity - 1);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
    } else {
      return;
    }
  } else {
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // Modifications to the bucket will not before this instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_without_missed(bucket_keys_ptr, bucket_capacity,
                                            key_pos, scores, kv_idx,
                                            global_epoch);
        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
        // memory_order_release:
        // Modifications to the bucket will not after this instruction.
        key_address->store(key, cuda::std::memory_order_release);
        return;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);
        if (probe_key == static_cast<K>(EMPTY_KEY)) {
          return;
        }
      } while (true);
    }
  }
}
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          uint32_t BLOCK_SIZE = 128, uint32_t GROUP_SIZE = 16,
          int Strategy = -1>
__global__ void pipeline_update_score_kernel(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    const K* __restrict__ keys, const S* __restrict__ scores, uint64_t n,
    const S global_epoch) {
  constexpr uint32_t BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);
  // Here, GROUP_SIZE * Load_LEN = BUCKET_SIZE.
  using VecD_Load = byte8;
  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);
  constexpr int RESERVE = 8;

  using BUCKET = Bucket<K, V, S>;
  using CopyScore = CopyScoreByPassCache<S, K, BUCKET_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  __shared__ VecD_Comp sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr[BLOCK_SIZE];
  // Reuse
  int* sm_counts = reinterpret_cast<int*>(sm_target_digests);
  int* sm_position = sm_counts;
  // Double buffer
  __shared__ D sm_digests[GROUP_NUM][2 * BUCKET_SIZE];
  __shared__ K sm_possible_keys[GROUP_NUM][2 * RESERVE];
  __shared__ int sm_possible_pos[GROUP_NUM][2 * RESERVE];
  __shared__ S sm_scores[GROUP_NUM][2];
  __shared__ int sm_ranks[GROUP_NUM][2];

  bool CAS_res[2]{false};

  // Initialization
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  uint64_t key_idx_base = (blockIdx.x * blockDim.x) + groupID * GROUP_SIZE;
  if (key_idx_base >= n) return;
  int loop_num =
      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;
  if (rank < loop_num) {
    int idx_block = groupID * GROUP_SIZE + rank;
    K key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = key;
    const K hashed_key = Murmur3HashDevice(key);
    sm_target_digests[idx_block] = digests_from_hashed<K>(hashed_key);
    uint64_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);
    uint64_t bkt_idx = global_idx / BUCKET_SIZE;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;
    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),
                            sizeof(K*));
    __pipeline_commit();
  }
  __pipeline_wait_prior(0);

  // Pipeline loading
  K* keys_ptr = sm_keys_ptr[groupID * GROUP_SIZE];
  D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);
  __pipeline_memcpy_async(sm_digests[groupID] + rank * Load_LEN, digests_ptr,
                          sizeof(VecD_Load));
  __pipeline_commit();
  // Padding, meet the param of the first `__pipeline_wait_prior`
  // in the first loop.
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    /* Step1: prefetch all digests in one bucket */
    if ((i + 1) < loop_num) {
      K* keys_ptr = sm_keys_ptr[key_idx_block + 1];
      D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);
      __pipeline_memcpy_async(
          sm_digests[groupID] + diff_buf(i) * BUCKET_SIZE + rank * Load_LEN,
          digests_ptr, sizeof(VecD_Load));
    }
    __pipeline_commit();

    /* Step2: check digests and load possible keys */
    VecD_Comp target_digests = sm_target_digests[key_idx_block];
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(3);
    VecD_Comp probing_digests = *reinterpret_cast<VecD_Comp*>(
        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE + rank * Comp_LEN]);
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    probing_digests = *reinterpret_cast<VecD_Comp*>(
        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE +
                             (GROUP_SIZE + rank) * Comp_LEN]);
    find_result_ = __vcmpeq4(probing_digests, target_digests);
    if ((find_result_ & 0x01) != 0) find_result |= 0x10;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x20;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x40;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x80;
    int find_number = __popc(find_result);
    int group_base = 0;
    if (find_number > 0) {
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr[key_idx_block];
    if (gt_vote == 0) {
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = digest_idx < 4
                            ? (rank * 4 + digest_idx)
                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);
          sm_possible_pos[groupID][same_buf(i) * RESERVE + group_base] =
              key_pos;
          __pipeline_memcpy_async(
              sm_possible_keys[groupID] + same_buf(i) * RESERVE + group_base,
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = digest_idx < 4
                            ? (rank * 4 + digest_idx)
                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_pos[groupID][same_buf(i) * RESERVE] = key_pos;
            sm_possible_keys[groupID][same_buf(i) * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) {
          break;
        }
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    /* Step3: check possible keys, and prefecth the value and score */
    if (i > 0) {
      key_idx_block -= 1;
      K target_key = sm_target_keys[key_idx_block];
      K* keys_ptr = sm_keys_ptr[key_idx_block];
      int possible_num = sm_counts[key_idx_block];
      sm_position[key_idx_block] = -1;
      __pipeline_wait_prior(3);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[groupID][diff_buf(i) * RESERVE + rank];
        key_pos = sm_possible_pos[groupID][diff_buf(i) * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
          if (scores) {
            int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
            const S* score_ptr = scores + key_idx_grid;
            CopyScore::ldg_sts(sm_scores[groupID] + diff_buf(i), score_ptr);
          }
          auto key_ptr = BUCKET::keys(keys_ptr, key_pos);
          sm_ranks[groupID][diff_buf(i)] = rank;
          if (diff_buf(i) == 0) {
            CAS_res[0] = key_ptr->compare_exchange_strong(
                possible_key, static_cast<K>(LOCKED_KEY),
                cuda::std::memory_order_acquire,
                cuda::std::memory_order_relaxed);
          } else {
            CAS_res[1] = key_ptr->compare_exchange_strong(
                possible_key, static_cast<K>(LOCKED_KEY),
                cuda::std::memory_order_acquire,
                cuda::std::memory_order_relaxed);
          }
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        sm_position[key_idx_block] = target_pos;
      }
    }
    __pipeline_commit();

    /* Step4: write back value and score */
    if (i > 1) {
      key_idx_block -= 1;
      int target_pos = sm_position[key_idx_block];
      K target_key = sm_target_keys[key_idx_block];
      K* keys_ptr = sm_keys_ptr[key_idx_block];
      int src_lane = sm_ranks[groupID][same_buf(i)];
      __pipeline_wait_prior(3);
      int succ = 0;
      if (rank == src_lane) {
        bool CAS_res_cur = same_buf(i) == 0 ? CAS_res[0] : CAS_res[1];
        succ = CAS_res_cur ? 1 : 0;
      }
      succ = g.shfl(succ, src_lane);
      if (target_pos >= 0 && succ == 1) {
        ScoreFunctor::update_without_missed(keys_ptr, BUCKET_SIZE, target_pos,
                                            sm_scores[groupID] + same_buf(i), 0,
                                            global_epoch);
        if (rank == 0) {
          auto key_address = BUCKET::keys(keys_ptr, target_pos);
          key_address->store(target_key, cuda::std::memory_order_release);
        }
      }
    }
  }  // End loop

  /* Pipeline emptying: step3, i = loop_num */
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    sm_position[key_idx_block] = -1;
    __pipeline_wait_prior(1);
    int key_pos;
    bool found_flag = false;
    if (rank < possible_num) {
      K possible_key =
          sm_possible_keys[groupID][diff_buf(loop_num) * RESERVE + rank];
      key_pos = sm_possible_pos[groupID][diff_buf(loop_num) * RESERVE + rank];
      if (possible_key == target_key) {
        found_flag = true;
        if (scores) {
          int key_idx_grid = blockIdx.x * blockDim.x + key_idx_block;
          const S* score_ptr = scores + key_idx_grid;
          CopyScore::ldg_sts(sm_scores[groupID] + diff_buf(loop_num),
                             score_ptr);
        }
        auto key_ptr = BUCKET::keys(keys_ptr, key_pos);
        sm_ranks[groupID][diff_buf(loop_num)] = rank;
        if (diff_buf(loop_num) == 0) {
          CAS_res[0] = key_ptr->compare_exchange_strong(
              possible_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
        } else {
          CAS_res[1] = key_ptr->compare_exchange_strong(
              possible_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
        }
      }
    }
    int found_vote = g.ballot(found_flag);
    if (found_vote) {
      int src_lane = __ffs(found_vote) - 1;
      int target_pos = g.shfl(key_pos, src_lane);
      sm_position[key_idx_block] = target_pos;
    }
  }
  __pipeline_commit();

  /* Pipeline emptying: step4, i = loop_num */
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    int target_pos = sm_position[key_idx_block];
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int src_lane = sm_ranks[groupID][same_buf(loop_num)];
    __pipeline_wait_prior(1);
    int succ = 0;
    if (rank == src_lane) {
      bool CAS_res_cur = same_buf(loop_num) == 0 ? CAS_res[0] : CAS_res[1];
      succ = CAS_res_cur ? 1 : 0;
    }
    succ = g.shfl(succ, src_lane);
    if (target_pos >= 0 && succ == 1) {
      ScoreFunctor::update_without_missed(
          keys_ptr, BUCKET_SIZE, target_pos,
          sm_scores[groupID] + same_buf(loop_num), 0, global_epoch);

      auto key_ptr = BUCKET::keys(keys_ptr, target_pos);
      if (rank == 0) {
        auto key_address = BUCKET::keys(keys_ptr, target_pos);
        key_address->store(target_key, cuda::std::memory_order_release);
      }
    }
  }

  /* Pipeline emptying: step4, i = loop_num + 1 */
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    int target_pos = sm_position[key_idx_block];
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int src_lane = sm_ranks[groupID][same_buf(loop_num + 1)];
    __pipeline_wait_prior(0);
    int succ = 0;
    if (rank == src_lane) {
      bool CAS_res_cur = same_buf(loop_num + 1) == 0 ? CAS_res[0] : CAS_res[1];
      succ = CAS_res_cur ? 1 : 0;
    }
    succ = g.shfl(succ, src_lane);
    if (target_pos >= 0 && succ == 1) {
      ScoreFunctor::update_without_missed(
          keys_ptr, BUCKET_SIZE, target_pos,
          sm_scores[groupID] + same_buf(loop_num + 1), 0, global_epoch);
      if (rank == 0) {
        auto key_address = BUCKET::keys(keys_ptr, target_pos);
        key_address->store(target_key, cuda::std::memory_order_release);
      }
    }
  }
}  // End function

template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct Params_UpdateScore {
  Params_UpdateScore(float load_factor_, Bucket<K, V, S>* __restrict__ buckets_,
                     size_t buckets_num_, uint32_t bucket_capacity_,
                     const K* __restrict__ keys_, const S* __restrict__ scores_,
                     size_t n_, const S global_epoch_)
      : load_factor(load_factor_),
        buckets(buckets_),
        buckets_num(buckets_num_),
        bucket_capacity(bucket_capacity_),
        keys(keys_),
        scores(scores_),
        n(n_),
        global_epoch(global_epoch_) {}
  float load_factor;
  Bucket<K, V, S>* __restrict__ buckets;
  size_t buckets_num;
  uint32_t bucket_capacity;
  const K* __restrict__ keys;
  const S* __restrict__ scores;
  uint64_t n;
  const S global_epoch;
};

template <typename K, typename V, typename S, int Strategy>
struct Launch_TLP_UpdateScore {
  using Params = Params_UpdateScore<K, V, S>;
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    tlp_update_score_kernel<K, V, S, BLOCK_SIZE, Strategy>
        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
            params.buckets, params.buckets_num, params.bucket_capacity,
            params.keys, params.scores, params.n, params.global_epoch);
  }
};

template <typename K, typename V, typename S, int Strategy>
struct Launch_Pipeline_UpdateScore {
  using Params = Params_UpdateScore<K, V, S>;
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    constexpr uint32_t GROUP_SIZE = 16;

    pipeline_update_score_kernel<K, V, S, BLOCK_SIZE, GROUP_SIZE, Strategy>
        <<<(params.n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
            params.buckets, params.buckets_num, params.keys, params.scores,
            params.n, params.global_epoch);
  }
};

template <typename K, typename V, typename S, int Strategy>
struct KernelSelector_UpdateScore {
  using Params = Params_UpdateScore<K, V, S>;

  static bool callable(bool unique_key, uint32_t bucket_size) {
    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);
    return (unique_key && bucket_size >= MinBucketCap);
  }

  static void select_kernel(Params& params, cudaStream_t& stream) {
    // This part is according to the test on A100.
    if (params.bucket_capacity != 128) {
      Launch_TLP_UpdateScore<K, V, S, Strategy>::launch_kernel(params, stream);
    } else {
      if (params.load_factor <= 0.60f) {
        Launch_TLP_UpdateScore<K, V, S, Strategy>::launch_kernel(params,
                                                                 stream);
      } else {
        Launch_Pipeline_UpdateScore<K, V, S, Strategy>::launch_kernel(params,
                                                                      stream);
      }
    }
  }  // End function
};

/*
 * update with IO operation. This kernel is
 * usually used for the pure HBM mode for better performance.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void update_score_kernel(const Table<K, V, S>* __restrict table,
                                    Bucket<K, V, S>* buckets,
                                    const size_t bucket_max_size,
                                    const size_t buckets_num,
                                    const K* __restrict keys,
                                    const S* __restrict scores,
                                    const S global_epoch, const size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_pos = -1;
    size_t key_idx = t / TILE_SIZE;

    const K update_key = keys[key_idx];

    if (IS_RESERVED_KEY<K>(update_key)) continue;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;

    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    OccupyResult occupy_result{OccupyResult::INITIAL};
    const int bucket_size = buckets_size[bkt_idx];

    if (bucket_size >= bucket_max_size) {
      start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
    }
    occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(
        g, bucket, update_key, start_idx, key_pos, src_lane, bucket_max_size);

    occupy_result = g.shfl(occupy_result, src_lane);

    if (occupy_result == OccupyResult::REFUSED) continue;

    if (occupy_result == OccupyResult::DUPLICATE) {
      if (src_lane == g.thread_rank()) {
        ScoreFunctor::update_without_missed(bucket, key_pos, scores, key_idx,
                                            global_epoch);
      }
    }

    if (g.thread_rank() == src_lane) {
      (bucket->keys(key_pos))
          ->store(update_key, cuda::std::memory_order_relaxed);
    }
  }
}

template <typename K, typename V, typename S, int Strategy>
struct SelectUpdateScoreKernel {
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, cudaStream_t& stream,
                             const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             const S* __restrict scores, const S global_epoch) {
    if (load_factor <= 0.75) {
      const unsigned int tile_size = 4;
      const size_t N = n * tile_size;
      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
      update_score_kernel<K, V, S, Strategy, tile_size>
          <<<grid_size, block_size, 0, stream>>>(table, buckets,
                                                 bucket_max_size, buckets_num,
                                                 keys, scores, global_epoch, N);
    } else {
      const unsigned int tile_size = 32;
      const size_t N = n * tile_size;
      const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
      update_score_kernel<K, V, S, Strategy, tile_size>
          <<<grid_size, block_size, 0, stream>>>(table, buckets,
                                                 bucket_max_size, buckets_num,
                                                 keys, scores, global_epoch, N);
    }
    return;
  }
};

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/update_values.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

// Use 1 thread to deal with a KV-pair, including copying value.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128>
__global__ void tlp_update_values_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,
    const VecV* __restrict__ values, uint64_t n) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, 1>;

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  uint32_t key_pos = {0};
  if (kv_idx < n) {
    key = keys[kv_idx];

    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      return;
    }
  } else {
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // Modifications to the bucket will not before this instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;
        const VecV* param_value_ptr = values + kv_idx * dim;
        CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);
        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
        // memory_order_release:
        // Modifications to the bucket will not after this instruction.
        key_address->store(key, cuda::std::memory_order_release);
        return;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);
        if (probe_key == static_cast<K>(EMPTY_KEY)) {
          return;
        }
      } while (true);
    }
  }
}
/**
 * Pipelined kernel that overwrites the values of keys that already exist in
 * the table. Keys that are not found are left untouched.
 *
 * Work layout: every cooperative group of GROUP_SIZE threads handles up to
 * GROUP_SIZE consecutive keys, one key per loop iteration, in four
 * overlapping stages:
 *   Step1: asynchronously prefetch the digests of the next key's bucket.
 *   Step2: compare digests and fetch the keys stored at matching positions.
 *   Step3: (previous key) compare the candidate keys, lock the matching slot
 *          by swapping it to LOCKED_KEY and stage the new value in shared
 *          memory.
 *   Step4: (key before that) copy the staged value into the bucket and store
 *          the key back, which releases the lock.
 * After the loop the last keys are drained through Step3 / Step4.
 *
 * Launch expectations (derived from the indexing in this kernel):
 *   - blockDim.x is expected to be BLOCK_SIZE: the static shared arrays are
 *     sized for BLOCK_SIZE threads.
 *   - dynamic shared memory of at least
 *     (BLOCK_SIZE / GROUP_SIZE) * 2 * dim * sizeof(VecV) bytes.
 *   - buckets hold exactly 128 keys (BUCKET_SIZE is hard-coded).
 *
 * NOTE(review): `same_buf` / `diff_buf` are defined outside this block;
 * presumably they select the current / other half of each double buffer from
 * the iteration parity - confirm against their definition.
 *
 * @param buckets      Bucket array of the table.
 * @param buckets_num  Number of buckets in `buckets`.
 * @param dim          Length of one value, in units of VecV.
 * @param keys         Keys to update, `n` entries.
 * @param values       New values, `n * dim` VecV elements.
 * @param n            Number of key-value pairs.
 */
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,
          uint32_t GROUP_SIZE = 16>
__global__ void pipeline_update_values_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    const uint32_t dim, const K* __restrict__ keys,
    const VecV* __restrict__ values, uint64_t n) {
  constexpr uint32_t BUCKET_SIZE = 128;
  constexpr int GROUP_NUM = BLOCK_SIZE / GROUP_SIZE;
  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);
  // Here, GROUP_SIZE * Load_LEN = BUCKET_SIZE.
  using VecD_Load = byte8;
  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);
  // Maximum number of candidate keys buffered per key in shared memory.
  constexpr int RESERVE = 8;

  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;

  __shared__ VecD_Comp sm_target_digests[BLOCK_SIZE];
  __shared__ K sm_target_keys[BLOCK_SIZE];
  __shared__ K* sm_keys_ptr[BLOCK_SIZE];
  __shared__ VecV* sm_values_ptr[BLOCK_SIZE];
  // Reuse: once a key's target digests have been read in Step2, the same
  // shared slot holds its candidate count and later its matched position.
  int* sm_counts = reinterpret_cast<int*>(sm_target_digests);
  int* sm_position = sm_counts;
  // Double buffer
  __shared__ D sm_digests[GROUP_NUM][2 * BUCKET_SIZE];
  __shared__ K sm_possible_keys[GROUP_NUM][2 * RESERVE];
  __shared__ int sm_possible_pos[GROUP_NUM][2 * RESERVE];
  __shared__ int sm_ranks[GROUP_NUM][2];
  // __shared__ VecV sm_values_buffer[GROUP_NUM][2 * dim];

  extern __shared__ __align__(alignof(byte16)) byte sm_values_buffer[];

  // Per-thread CAS outcome for each half of the double buffer.
  bool CAS_res[2]{false};

  // Initialization
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  int groupID = threadIdx.x / GROUP_SIZE;
  int rank = g.thread_rank();
  // Widen before multiplying so the global key index cannot wrap in 32 bits.
  uint64_t key_idx_base =
      static_cast<uint64_t>(blockIdx.x) * blockDim.x + groupID * GROUP_SIZE;
  // The whole group leaves together when it has no key to process.
  if (key_idx_base >= n) return;
  int loop_num =
      (n - key_idx_base) < GROUP_SIZE ? (n - key_idx_base) : GROUP_SIZE;
  if (rank < loop_num) {
    // Each lane prepares one key: hash it, derive the digests and fetch the
    // bucket's key / value base pointers into shared memory.
    int idx_block = groupID * GROUP_SIZE + rank;
    K key = keys[key_idx_base + rank];
    sm_target_keys[idx_block] = key;
    const K hashed_key = Murmur3HashDevice(key);
    sm_target_digests[idx_block] = digests_from_hashed<K>(hashed_key);
    uint64_t global_idx = hashed_key % (buckets_num * BUCKET_SIZE);
    uint64_t bkt_idx = global_idx / BUCKET_SIZE;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;
    __pipeline_memcpy_async(sm_keys_ptr + idx_block, bucket->keys_addr(),
                            sizeof(K*));
    __pipeline_commit();
    __pipeline_memcpy_async(sm_values_ptr + idx_block, &(bucket->vectors),
                            sizeof(VecV*));
  }
  __pipeline_wait_prior(0);

  // Pipeline loading
  K* keys_ptr = sm_keys_ptr[groupID * GROUP_SIZE];
  D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);
  __pipeline_memcpy_async(sm_digests[groupID] + rank * Load_LEN, digests_ptr,
                          sizeof(VecD_Load));
  __pipeline_commit();
  // Padding, meet the param of the first `__pipeline_wait_prior`
  // in the first loop.
  __pipeline_commit();
  __pipeline_commit();

  for (int i = 0; i < loop_num; i++) {
    int key_idx_block = groupID * GROUP_SIZE + i;

    /* Step1: prefetch all digests in one bucket */
    if ((i + 1) < loop_num) {
      K* keys_ptr = sm_keys_ptr[key_idx_block + 1];
      D* digests_ptr = BUCKET::digests(keys_ptr, BUCKET_SIZE, rank * Load_LEN);
      __pipeline_memcpy_async(
          sm_digests[groupID] + diff_buf(i) * BUCKET_SIZE + rank * Load_LEN,
          digests_ptr, sizeof(VecD_Load));
    }
    __pipeline_commit();

    /* Step2: check digests and load possible keys */
    VecD_Comp target_digests = sm_target_digests[key_idx_block];
    sm_counts[key_idx_block] = 0;
    __pipeline_wait_prior(3);
    // Each lane compares two 4-digest words: bits 0-3 of `find_result` mark
    // matches in the first word, bits 4-7 matches in the second word.
    VecD_Comp probing_digests = *reinterpret_cast<VecD_Comp*>(
        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE + rank * Comp_LEN]);
    uint32_t find_result_ = __vcmpeq4(probing_digests, target_digests);
    uint32_t find_result = 0;
    if ((find_result_ & 0x01) != 0) find_result |= 0x01;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x02;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x04;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x08;
    probing_digests = *reinterpret_cast<VecD_Comp*>(
        &sm_digests[groupID][same_buf(i) * BUCKET_SIZE +
                             (GROUP_SIZE + rank) * Comp_LEN]);
    find_result_ = __vcmpeq4(probing_digests, target_digests);
    if ((find_result_ & 0x01) != 0) find_result |= 0x10;
    if ((find_result_ & 0x0100) != 0) find_result |= 0x20;
    if ((find_result_ & 0x010000) != 0) find_result |= 0x40;
    if ((find_result_ & 0x01000000) != 0) find_result |= 0x80;
    int find_number = __popc(find_result);
    int group_base = 0;
    if (find_number > 0) {
      // Reserve a contiguous range of candidate slots for this lane.
      group_base = atomicAdd(sm_counts + key_idx_block, find_number);
    }
    bool gt_reserve = (group_base + find_number) > RESERVE;
    int gt_vote = g.ballot(gt_reserve);
    K* key_ptr = sm_keys_ptr[key_idx_block];
    if (gt_vote == 0) {
      // Fast path: all candidates fit into the RESERVE slots, so fetch the
      // candidate keys asynchronously and compare them in Step3.
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          // For digest_idx in [4, 8) this equals
          // (GROUP_SIZE + rank) * 4 + (digest_idx - 4).
          int key_pos = digest_idx < 4
                            ? (rank * 4 + digest_idx)
                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);
          sm_possible_pos[groupID][same_buf(i) * RESERVE + group_base] =
              key_pos;
          __pipeline_memcpy_async(
              sm_possible_keys[groupID] + same_buf(i) * RESERVE + group_base,
              key_ptr + key_pos, sizeof(K));
          group_base += 1;
        } else {
          break;
        }
      } while (true);
    } else {
      // Slow path: too many digest matches to buffer. Compare the keys
      // directly and keep at most one candidate, the real match.
      K target_key = sm_target_keys[key_idx_block];
      sm_counts[key_idx_block] = 0;
      int found_vote = 0;
      bool found = false;
      do {
        int digest_idx = __ffs(find_result) - 1;
        if (digest_idx >= 0) {
          find_result &= (find_result - 1);
          int key_pos = digest_idx < 4
                            ? (rank * 4 + digest_idx)
                            : ((GROUP_SIZE + rank - 1) * 4 + digest_idx);
          K possible_key = key_ptr[key_pos];
          if (possible_key == target_key) {
            found = true;
            sm_counts[key_idx_block] = 1;
            sm_possible_pos[groupID][same_buf(i) * RESERVE] = key_pos;
            sm_possible_keys[groupID][same_buf(i) * RESERVE] = possible_key;
          }
        }
        found_vote = g.ballot(found);
        if (found_vote) {
          break;
        }
        // Keep looping while any lane still has digest matches to inspect.
        found_vote = digest_idx >= 0;
      } while (g.any(found_vote));
    }
    __pipeline_commit();

    /* Step3: check possible keys, and prefetch the value */
    if (i > 0) {
      key_idx_block -= 1;
      K target_key = sm_target_keys[key_idx_block];
      K* keys_ptr = sm_keys_ptr[key_idx_block];
      int possible_num = sm_counts[key_idx_block];
      sm_position[key_idx_block] = -1;
      __pipeline_wait_prior(3);
      int key_pos;
      bool found_flag = false;
      if (rank < possible_num) {
        K possible_key =
            sm_possible_keys[groupID][diff_buf(i) * RESERVE + rank];
        key_pos = sm_possible_pos[groupID][diff_buf(i) * RESERVE + rank];
        if (possible_key == target_key) {
          found_flag = true;
          auto key_ptr = BUCKET::keys(keys_ptr, key_pos);
          sm_ranks[groupID][diff_buf(i)] = rank;
          // Lock the slot; the outcome is consumed by Step4 of this key.
          if (diff_buf(i) == 0) {
            CAS_res[0] = key_ptr->compare_exchange_strong(
                possible_key, static_cast<K>(LOCKED_KEY),
                cuda::std::memory_order_acquire,
                cuda::std::memory_order_relaxed);
          } else {
            CAS_res[1] = key_ptr->compare_exchange_strong(
                possible_key, static_cast<K>(LOCKED_KEY),
                cuda::std::memory_order_acquire,
                cuda::std::memory_order_relaxed);
          }
        }
      }
      int found_vote = g.ballot(found_flag);
      if (found_vote) {
        int src_lane = __ffs(found_vote) - 1;
        int target_pos = g.shfl(key_pos, src_lane);
        sm_position[key_idx_block] = target_pos;
        // 64-bit arithmetic: key index times dim can exceed 2^32 elements.
        uint64_t key_idx_grid =
            static_cast<uint64_t>(blockIdx.x) * blockDim.x + key_idx_block;
        const VecV* v_src = values + key_idx_grid * dim;
        auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
        VecV* v_dst = tmp + (groupID * 2 + diff_buf(i)) * dim;
        CopyValue::ldg_sts(rank, v_dst, v_src, dim);
      }
    }
    __pipeline_commit();

    /* Step4: write back value */
    if (i > 1) {
      key_idx_block -= 1;
      VecV* value_ptr = sm_values_ptr[key_idx_block];
      int target_pos = sm_position[key_idx_block];
      K target_key = sm_target_keys[key_idx_block];
      K* keys_ptr = sm_keys_ptr[key_idx_block];
      int src_lane = sm_ranks[groupID][same_buf(i)];
      __pipeline_wait_prior(3);
      int succ = 0;
      if (rank == src_lane) {
        bool CAS_res_cur = same_buf(i) == 0 ? CAS_res[0] : CAS_res[1];
        succ = CAS_res_cur ? 1 : 0;
      }
      // Broadcast the locking lane's CAS outcome to the whole group.
      succ = g.shfl(succ, src_lane);
      if (target_pos >= 0 && succ == 1) {
        auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
        VecV* v_src = tmp + (groupID * 2 + same_buf(i)) * dim;
        VecV* v_dst = value_ptr + target_pos * dim;
        CopyValue::lds_stg(rank, v_dst, v_src, dim);
        if (rank == 0) {
          // Storing the key back with release ordering unlocks the slot.
          auto key_address = BUCKET::keys(keys_ptr, target_pos);
          key_address->store(target_key, cuda::std::memory_order_release);
        }
      }
    }
  }  // End loop

  /* Pipeline emptying: step3, i = loop_num */
  {
    int key_idx_block = groupID * GROUP_SIZE + (loop_num - 1);
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int possible_num = sm_counts[key_idx_block];
    sm_position[key_idx_block] = -1;
    __pipeline_wait_prior(1);
    int key_pos;
    bool found_flag = false;
    if (rank < possible_num) {
      K possible_key =
          sm_possible_keys[groupID][diff_buf(loop_num) * RESERVE + rank];
      key_pos = sm_possible_pos[groupID][diff_buf(loop_num) * RESERVE + rank];
      if (possible_key == target_key) {
        found_flag = true;
        auto key_ptr = BUCKET::keys(keys_ptr, key_pos);
        sm_ranks[groupID][diff_buf(loop_num)] = rank;
        if (diff_buf(loop_num) == 0) {
          CAS_res[0] = key_ptr->compare_exchange_strong(
              possible_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
        } else {
          CAS_res[1] = key_ptr->compare_exchange_strong(
              possible_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
        }
      }
    }
    int found_vote = g.ballot(found_flag);
    if (found_vote) {
      int src_lane = __ffs(found_vote) - 1;
      int target_pos = g.shfl(key_pos, src_lane);
      sm_position[key_idx_block] = target_pos;
      // 64-bit arithmetic: key index times dim can exceed 2^32 elements.
      uint64_t key_idx_grid =
          static_cast<uint64_t>(blockIdx.x) * blockDim.x + key_idx_block;
      const VecV* v_src = values + key_idx_grid * dim;
      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
      VecV* v_dst = tmp + (groupID * 2 + diff_buf(loop_num)) * dim;
      CopyValue::ldg_sts(rank, v_dst, v_src, dim);
    }
  }
  __pipeline_commit();

  /* Pipeline emptying: step4, i = loop_num */
  if (loop_num > 1) {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 2;
    VecV* value_ptr = sm_values_ptr[key_idx_block];
    int target_pos = sm_position[key_idx_block];
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int src_lane = sm_ranks[groupID][same_buf(loop_num)];
    __pipeline_wait_prior(1);
    int succ = 0;
    if (rank == src_lane) {
      bool CAS_res_cur = same_buf(loop_num) == 0 ? CAS_res[0] : CAS_res[1];
      succ = CAS_res_cur ? 1 : 0;
    }
    succ = g.shfl(succ, src_lane);
    if (target_pos >= 0 && succ == 1) {
      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
      VecV* v_src = tmp + (groupID * 2 + same_buf(loop_num)) * dim;
      VecV* v_dst = value_ptr + target_pos * dim;
      CopyValue::lds_stg(rank, v_dst, v_src, dim);

      if (rank == 0) {
        auto key_address = BUCKET::keys(keys_ptr, target_pos);
        key_address->store(target_key, cuda::std::memory_order_release);
      }
    }
  }

  /* Pipeline emptying: step4, i = loop_num + 1 */
  {
    int key_idx_block = groupID * GROUP_SIZE + loop_num - 1;
    VecV* value_ptr = sm_values_ptr[key_idx_block];
    int target_pos = sm_position[key_idx_block];
    K target_key = sm_target_keys[key_idx_block];
    K* keys_ptr = sm_keys_ptr[key_idx_block];
    int src_lane = sm_ranks[groupID][same_buf(loop_num + 1)];
    __pipeline_wait_prior(0);
    int succ = 0;
    if (rank == src_lane) {
      bool CAS_res_cur = same_buf(loop_num + 1) == 0 ? CAS_res[0] : CAS_res[1];
      succ = CAS_res_cur ? 1 : 0;
    }
    succ = g.shfl(succ, src_lane);
    if (target_pos >= 0 && succ == 1) {
      auto tmp = reinterpret_cast<VecV*>(sm_values_buffer);
      VecV* v_src = tmp + (groupID * 2 + same_buf(loop_num + 1)) * dim;
      VecV* v_dst = value_ptr + target_pos * dim;
      CopyValue::lds_stg(rank, v_dst, v_src, dim);
      if (rank == 0) {
        auto key_address = BUCKET::keys(keys_ptr, target_pos);
        key_address->store(target_key, cuda::std::memory_order_release);
      }
    }
  }
}  // End function

/**
 * Argument bundle handed to the update-values kernel launchers and to the
 * kernel selector.
 *
 * @tparam K Key type.
 * @tparam V Value element type.
 * @tparam S Score type.
 */
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct Params_UpdateValues {
  // Load factor of the table; the kernel selector reads it to pick a kernel.
  float load_factor;
  // Bucket array of the table.
  Bucket<K, V, S>* __restrict__ buckets;
  // Number of entries in `buckets`.
  size_t buckets_num;
  // Number of key slots per bucket.
  uint32_t bucket_capacity;
  // Value length. Given in elements of V on construction; the launchers
  // rewrite it in place to a count of their vector type before launching.
  uint32_t dim;
  // Keys to update, `n` entries.
  const K* __restrict__ keys;
  // New values for `keys`.
  const V* __restrict__ values;
  // Number of key-value pairs.
  uint64_t n;

  // Stores every argument in the member of the same name (minus the
  // trailing underscore); no validation is performed.
  Params_UpdateValues(float load_factor_,
                      Bucket<K, V, S>* __restrict__ buckets_,
                      size_t buckets_num_, uint32_t bucket_capacity_,
                      uint32_t dim_, const K* __restrict__ keys_,
                      const V* __restrict__ values_, size_t n_)
      : load_factor{load_factor_},
        buckets{buckets_},
        buckets_num{buckets_num_},
        bucket_capacity{bucket_capacity_},
        dim{dim_},
        keys{keys_},
        values{values_},
        n{n_} {}
};

/**
 * Host-side launcher for `tlp_update_values_kernel_with_io`.
 *
 * The launch uses ceil(n / 128) blocks of 128 threads and no dynamic shared
 * memory. `VecV` is the vector type used to view the values.
 *
 * Side effect: `params.dim` is overwritten with the value length expressed
 * in VecV units, so the same `params` object must not be launched twice.
 */
template <typename K, typename V, typename S, typename VecV>
struct Launch_TLP_UpdateValues {
  using Params = Params_UpdateValues<K, V, S>;
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int kThreadsPerBlock = 128;
    // Convert the value length from V elements to VecV elements.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    const auto block_count =
        (params.n + kThreadsPerBlock - 1) / kThreadsPerBlock;
    const VecV* vec_values = reinterpret_cast<const VecV*>(params.values);

    tlp_update_values_kernel_with_io<K, V, S, VecV, kThreadsPerBlock>
        <<<block_count, kThreadsPerBlock, 0, stream>>>(
            params.buckets, params.buckets_num, params.bucket_capacity,
            params.dim, params.keys, vec_values, params.n);
  }
};

/**
 * Host-side launcher for `pipeline_update_values_kernel_with_io`.
 *
 * The launch uses ceil(n / 128) blocks of 128 threads, split into groups of
 * 16 threads. Each group gets two value-sized staging buffers in dynamic
 * shared memory; the total is rounded up to a multiple of sizeof(byte16).
 *
 * Side effect: `params.dim` is overwritten with the value length expressed
 * in VecV units, so the same `params` object must not be launched twice.
 */
template <typename K, typename V, typename S, typename VecV>
struct Launch_Pipeline_UpdateValues {
  using Params = Params_UpdateValues<K, V, S>;
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int kThreadsPerBlock = 128;
    constexpr uint32_t kGroupSize = 16;
    constexpr uint32_t kGroupsPerBlock = kThreadsPerBlock / kGroupSize;

    // Convert the value length from V elements to VecV elements.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // Two staging buffers (double buffering) per group.
    const uint32_t staging_bytes =
        kGroupsPerBlock * 2 * params.dim * sizeof(VecV);
    const uint32_t shared_mem =
        (staging_bytes + sizeof(byte16) - 1) / sizeof(byte16) * sizeof(byte16);

    const auto block_count =
        (params.n + kThreadsPerBlock - 1) / kThreadsPerBlock;
    const VecV* vec_values = reinterpret_cast<const VecV*>(params.values);

    pipeline_update_values_kernel_with_io<K, V, S, VecV, kThreadsPerBlock,
                                          kGroupSize>
        <<<block_count, kThreadsPerBlock, shared_mem, stream>>>(
            params.buckets, params.buckets_num, params.dim, params.keys,
            vec_values, params.n);
  }
};

/// Per-architecture value-size thresholds (in bytes) consulted when choosing
/// between the TLP and the Pipeline update-values kernels. Only the
/// specializations below are defined.
template <typename ArchTag>
struct ValueConfig_UpdateValues;

/// TODO: support more arch.
template <>
struct ValueConfig_UpdateValues<Sm80> {
  // Values larger than this perform poorly with the TLP kernel.
  static constexpr uint32_t size_tlp = sizeof(byte4) * 8;
  // Values larger than this reduce occupancy of the Pipeline kernel; for
  // very large values the kernel fails to launch.
  static constexpr uint32_t size_pipeline = sizeof(byte4) * 128;
};

template <>
struct ValueConfig_UpdateValues<Sm70> {
  // Values larger than this perform poorly with the TLP kernel.
  static constexpr uint32_t size_tlp = sizeof(byte4) * 8;
  // Values larger than this reduce occupancy of the Pipeline kernel; for
  // very large values the kernel fails to launch.
  static constexpr uint32_t size_pipeline = sizeof(byte4) * 64;
};

/**
 * Decides whether the TLP / Pipeline update-values kernels can serve a
 * request and, if so, launches the one expected to perform best.
 *
 * @tparam ArchTag Architecture tag selecting the `ValueConfig_UpdateValues`
 *                 thresholds.
 */
template <typename K, typename V, typename S, typename ArchTag>
struct KernelSelector_UpdateValues {
  using ValueConfig = ValueConfig_UpdateValues<ArchTag>;
  using Params = Params_UpdateValues<K, V, S>;

  /**
   * Returns true when one of the two kernels supports the configuration.
   *
   * @param unique_key  Must be true for these kernels to be usable.
   * @param bucket_size Number of key slots per bucket.
   * @param dim         Value length in elements of V.
   */
  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {
    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);
    const bool eligible = unique_key && bucket_size >= MinBucketCap;
    if (!eligible) return false;

    const uint32_t value_size = dim * sizeof(V);
    // TLP handles any eligible bucket size as long as the value is small.
    const bool fits_tlp = value_size <= ValueConfig::size_tlp;
    // Pipeline additionally requires buckets of exactly 128 slots.
    const bool fits_pipeline =
        bucket_size == 128 && value_size <= ValueConfig::size_pipeline;
    return fits_tlp || fits_pipeline;
  }

 private:
  // Launches `Launcher` with the widest vector type whose size divides the
  // value size in bytes (byte16, byte8, byte4, byte2, then byte).
  template <template <typename, typename, typename, typename> class Launcher>
  static void launch_with_widest_vec(Params& params, cudaStream_t& stream,
                                     uint32_t total_value_size) {
    if (total_value_size % sizeof(byte16) == 0) {
      Launcher<K, V, S, byte16>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte8) == 0) {
      Launcher<K, V, S, byte8>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte4) == 0) {
      Launcher<K, V, S, byte4>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte2) == 0) {
      Launcher<K, V, S, byte2>::launch_kernel(params, stream);
    } else {
      Launcher<K, V, S, byte>::launch_kernel(params, stream);
    }
  }

 public:
  /**
   * Picks and launches a kernel on `stream`. The launcher rewrites
   * `params.dim` in place, so the value size is captured up front.
   */
  static void select_kernel(Params& params, cudaStream_t& stream) {
    const uint32_t total_value_size =
        static_cast<uint32_t>(params.dim * sizeof(V));

    // This part is according to the test on A100.
    // TLP is used for buckets that are not 128 slots wide, and for small
    // values at a load factor of at most 0.60; everything else goes to
    // Pipeline.
    const bool prefer_tlp =
        params.bucket_capacity != 128 ||
        (total_value_size <= ValueConfig::size_tlp &&
         params.load_factor <= 0.60f);

    if (prefer_tlp) {
      launch_with_widest_vec<Launch_TLP_UpdateValues>(params, stream,
                                                      total_value_size);
    } else {
      launch_with_widest_vec<Launch_Pipeline_UpdateValues>(params, stream,
                                                           total_value_size);
    }
  }  // End function
};

/*
 * Update the values of existing keys, copying the new value inside the
 * kernel. This kernel is usually used for the pure HBM mode for better
 * performance.
 *
 * A tile of TILE_SIZE threads cooperates on each key, so `N` is the number
 * of keys multiplied by TILE_SIZE. Reserved keys and keys refused by
 * `find_and_lock_for_update` are skipped.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__global__ void update_values_kernel_with_io(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, const V* __restrict values, const size_t N) {
  auto tile = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* bucket_sizes = table->buckets_size;

  const size_t stride = blockDim.x * gridDim.x;
  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += stride) {
    // All threads of a tile share the same key.
    const size_t key_idx = t / TILE_SIZE;
    const K update_key = keys[key_idx];

    if (IS_RESERVED_KEY<K>(update_key)) continue;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    // When the bucket is full, begin probing at a tile-aligned position.
    const int bucket_size = bucket_sizes[bkt_idx];
    if (bucket_size >= bucket_max_size) {
      start_idx -= start_idx % TILE_SIZE;
    }

    int key_pos = -1;
    int src_lane = -1;
    OccupyResult occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(
        tile, bucket, update_key, start_idx, key_pos, src_lane,
        bucket_max_size);

    // Agree on the result reported by the lane that did the locking.
    occupy_result = tile.shfl(occupy_result, src_lane);

    if (occupy_result == OccupyResult::REFUSED) continue;

    if (occupy_result == OccupyResult::DUPLICATE) {
      copy_vector<V, TILE_SIZE>(tile, values + key_idx * dim,
                                bucket->vectors + key_pos * dim, dim);
    }

    // The locking lane stores the key back into its slot.
    // NOTE(review): the store is relaxed, unlike the release store used by
    // the pipeline kernel in this file - confirm this is intended.
    if (tile.thread_rank() == src_lane) {
      (bucket->keys(key_pos))
          ->store(update_key, cuda::std::memory_order_relaxed);
    }
  }
}

/**
 * Launches `update_values_kernel_with_io` with a tile size chosen from the
 * load factor: 4 threads per key up to a load factor of 0.75, 32 threads
 * per key above it.
 */
template <typename K, typename V, typename S>
struct SelectUpdateValuesKernelWithIO {
 private:
  // Launches the kernel with `tile_size` threads cooperating on each key;
  // the kernel therefore receives n * tile_size as its work-item count.
  template <unsigned int tile_size>
  static void launch_tiled(const int& block_size, const size_t bucket_max_size,
                           const size_t buckets_num, const size_t dim,
                           cudaStream_t& stream, const size_t& n,
                           const Table<K, V, S>* __restrict table,
                           Bucket<K, V, S>* buckets, const K* __restrict keys,
                           const V* __restrict values) {
    const size_t N = n * tile_size;
    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    update_values_kernel_with_io<K, V, S, tile_size>
        <<<grid_size, block_size, 0, stream>>>(table, buckets, bucket_max_size,
                                               buckets_num, dim, keys, values,
                                               N);
  }

 public:
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             const V* __restrict values) {
    if (load_factor <= 0.75) {
      launch_tiled<4>(block_size, bucket_max_size, buckets_num, dim, stream, n,
                      table, buckets, keys, values);
    } else {
      launch_tiled<32>(block_size, bucket_max_size, buckets_num, dim, stream,
                       n, table, buckets, keys, values);
    }
  }
};

// Use 1 thread to deal with a KV-pair. This kernel does not copy values: it
// locates and locks the key, then publishes pointers for the caller.
//
// Per key `kv_idx` (only threads with kv_idx < n do any work):
//   - `src_offset[kv_idx]` is set to `kv_idx` when `src_offset` is non-null.
//   - Reserved key: `key_ptrs[kv_idx]` is set to nullptr and the thread exits.
//   - Key found: the key slot is swapped to LOCKED_KEY, `values[kv_idx]`
//     receives the address of the value inside the bucket and
//     `key_ptrs[kv_idx]` the address of the locked key slot. The slot is NOT
//     unlocked here.
//     NOTE(review): presumably a later kernel restores the key through
//     `key_ptrs` - confirm against the caller.
//   - Key not found: the thread exits without writing `values[kv_idx]` or
//     `key_ptrs[kv_idx]`.
//     NOTE(review): the caller presumably pre-initializes both arrays -
//     confirm.
//
// `bucket_capacity` is used as a mask (`& (bucket_capacity - 1)`), so it is
// expected to be a power of two.
template <typename K, typename V, typename S>
__global__ void tlp_update_values_kernel_hybrid(
    Bucket<K, V, S>* __restrict__ buckets, const uint64_t buckets_num,
    uint32_t bucket_capacity, const uint32_t dim, const K* __restrict__ keys,
    V** __restrict__ values, K** __restrict__ key_ptrs,
    int* __restrict src_offset, uint64_t n) {
  using BUCKET = Bucket<K, V, S>;

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  V* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  uint32_t key_pos = {0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    if (src_offset) src_offset[kv_idx] = kv_idx;
    if (!IS_RESERVED_KEY<K>(key)) {
      // Hash the key to find its bucket, the probing start position and the
      // digests to search for.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = bucket->vectors;
    } else {
      // Reserved keys are never updated; report "no key slot".
      key_ptrs[kv_idx] = nullptr;
      return;
    }
  } else {
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    // `occupy_result` is never reassigned in this kernel, so this check
    // never fires; the loop ends through the returns below or the bound.
    if (occupy_result != OccupyResult::INITIAL) break;

    // Probe position for this step, wrapped around the bucket.
    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep one bit per matching byte so __ffs can enumerate the matches.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        // Clear the lowest set bit: this candidate is being handled now.
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // Acquire ordering: later accesses to the bucket are not reordered
        // before this CAS. Success means the slot held `key` and now holds
        // LOCKED_KEY.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        // Found and locked: publish the value and key-slot addresses.
        key_pos = possible_pos;
        V* bucket_value_ptr = bucket_values_ptr + key_pos * dim;
        values[kv_idx] = bucket_value_ptr;
        key_ptrs[kv_idx] = bucket_keys_ptr + key_pos;
        return;
      }
      // No match in this word: look for empty slots. A slot at or after the
      // start position that really holds EMPTY_KEY ends the search.
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first step, positions before the start position do not
        // count; they are skipped here.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        auto probe_key = current_key->load(cuda::std::memory_order_relaxed);
        if (probe_key == static_cast<K>(EMPTY_KEY)) {
          return;
        }
      } while (true);
    }
  }
}

/*
 * Locate and lock existing keys and report where their values live, without
 * copying any value data.
 *
 * A tile of TILE_SIZE threads cooperates on each key, so `N` is the number
 * of keys multiplied by TILE_SIZE. For every non-reserved key,
 * `src_offset[key_idx]` is set to `key_idx`. Unless the lookup is refused,
 * `vectors[key_idx]` receives the address of the value inside the bucket
 * (or nullptr when the result is not DUPLICATE) and the key is stored back
 * into its slot. Reserved and refused keys leave `vectors[key_idx]`
 * untouched.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__global__ void update_values_kernel(const Table<K, V, S>* __restrict table,
                                     Bucket<K, V, S>* buckets,
                                     const size_t bucket_max_size,
                                     const size_t buckets_num, const size_t dim,
                                     const K* __restrict keys,
                                     V** __restrict vectors,
                                     int* __restrict src_offset, size_t N) {
  auto tile = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* bucket_sizes = table->buckets_size;

  const size_t stride = blockDim.x * gridDim.x;
  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += stride) {
    // All threads of a tile share the same key.
    const size_t key_idx = t / TILE_SIZE;
    const K update_key = keys[key_idx];

    if (IS_RESERVED_KEY<K>(update_key)) continue;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, update_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    const int bucket_size = bucket_sizes[bkt_idx];
    src_offset[key_idx] = key_idx;

    // When the bucket is full, begin probing at a tile-aligned position.
    if (bucket_size >= bucket_max_size) {
      start_idx -= start_idx % TILE_SIZE;
    }

    int key_pos = -1;
    int src_lane = -1;
    OccupyResult occupy_result = find_and_lock_for_update<K, V, S, TILE_SIZE>(
        tile, bucket, update_key, start_idx, key_pos, src_lane,
        bucket_max_size);

    // Agree on the result reported by the lane that did the locking.
    occupy_result = tile.shfl(occupy_result, src_lane);

    if (occupy_result == OccupyResult::REFUSED) continue;

    if (tile.thread_rank() == src_lane) {
      // Publish the destination address first, then store the key back.
      vectors[key_idx] = (occupy_result == OccupyResult::DUPLICATE)
                             ? (bucket->vectors + key_pos * dim)
                             : nullptr;
      (bucket->keys(key_pos))
          ->store(update_key, cuda::std::memory_order_relaxed);
    }
  }
}

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/upsert.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

// Upsert kernel in which one thread handles one KV-pair end to end: it finds
// and locks a slot in the key's bucket, updates the score/digest, copies the
// value and finally publishes the key.
//
// Launch layout: 1D grid and block. Thread
// `blockIdx.x * blockDim.x + threadIdx.x` processes `keys[kv_idx]`; threads
// with `kv_idx >= n` or with a reserved key return immediately.
// `blockDim.x` must not exceed BLOCK_SIZE because the static shared score
// buffer is indexed by `threadIdx.x`.
//
// Parameters:
//   buckets          Bucket array, indexed by the bucket index derived from
//                    the hashed key.
//   buckets_size     Per-bucket element counters; incremented atomically when
//                    an empty or reclaimed slot is taken.
//   buckets_num      Number of buckets.
//   bucket_capacity  Slots per bucket. Positions are wrapped with
//                    `& (bucket_capacity - 1)`, so a power-of-two capacity is
//                    assumed (NOTE(review): confirm against the launcher).
//   dim              Number of VecV elements that make up one value.
//   keys/values/scores  Input batch of `n` entries; `scores` is only handed to
//                    ScoreFunctor.
//   n                Number of KV-pairs in the batch.
//   global_epoch     Forwarded to ScoreFunctor::desired_when_missed.
//
// Slot acquisition order:
//   1. Probe the digests for the key itself (DUPLICATE) or, if the sampled
//      bucket size is below capacity, for an empty slot (OCCUPIED_EMPTY).
//   2. Otherwise scan all scores for the minimum and try to take over that
//      slot (EVICT / OCCUPIED_RECLAIMED), or give up (REFUSED) when the new
//      score is lower than the minimum.
// A slot is locked by CAS-ing its key to LOCKED_KEY and unlocked by the final
// release-store of the real key.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void tlp_v1_upsert_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, const VecV* __restrict__ values,
    const S* __restrict__ scores, uint64_t n, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, 1>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // bucket_capacity is a multiple of 4.
  constexpr uint32_t STRIDE_S = 4;
  // Number of scores moved by one 16-byte load.
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  // Per-thread staging area for the score scan: two chunks of STRIDE_S scores.
  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);

    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      // Snapshot only; other threads may change the size concurrently.
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      return;
    }
  } else {
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One extra iteration wraps back to the first chunk: on the first pass
  // (offset == 0) empty slots located before `key_pos` are skipped, so the
  // final pass gives them a chance.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Byte-wise vectorized comparison: every equal byte becomes 0xff in the
      // result. Keeping only the lowest bit of each byte leaves exactly one
      // set bit per candidate.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        // Clear the candidate that is being examined.
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // memory_order_acquire:
        // modifications to the bucket are not reordered before this lock.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        // The key is already stored in the bucket and is now locked by us.
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);

        break;
      } else if (bucket_size == bucket_capacity) {
        // The sampled size says the bucket is full: skip the empty-slot probe.
        continue;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // On the first chunk, ignore slots in front of the start position.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }
  // Neither the key nor an empty slot was locked: look for the slot with the
  // minimum score and try to take it over. The loop repeats until a slot is
  // locked or the insertion is refused.
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
    // Prefetch the first chunk of scores into this thread's staging area.
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      // Prefetch the following chunk (if any) while the current one is being
      // consumed. NOTE(review): diff_buf/same_buf presumably select the two
      // alternating halves of the staging area; confirm in kernel_utils.cuh.
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      // Everything except the most recently committed batch has completed,
      // i.e. the chunk consumed below is available.
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            min_score = temp_score;
            min_pos = i + k + j;
          }
        }
      }
    }

    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (score < min_score) {
      // The new entry ranks below everything stored: do not insert it.
      occupy_result = OccupyResult::REFUSED;
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        // Re-read the score under the lock: the slot is only taken if its
        // score is still not greater than the minimum found by the scan.
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                           kv_idx, score, bucket_capacity,
                                           get_digest<K>(key), true);

          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            // A reclaimed slot did not count towards the bucket size.
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
          }
        } else {
          // The slot changed in the meantime: put the key back (unlock) and
          // rescan.
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }
  // Element offsets are computed in 64 bits: `kv_idx * dim` evaluated in
  // 32 bits would wrap for large batches.
  VecV* bucket_value_ptr =
      bucket_values_ptr + static_cast<uint64_t>(key_pos) * dim;
  const VecV* param_value_ptr = values + static_cast<uint64_t>(kv_idx) * dim;

  if (occupy_result != OccupyResult::REFUSED) {
    CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);
    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
    // memory_order_release:
    // modifications to the bucket are not reordered after this store, which
    // also unlocks the slot by replacing LOCKED_KEY with the real key.
    key_address->store(key, cuda::std::memory_order_release);
  }
}

// Upsert kernel in which every thread locates and locks a slot for its own
// KV-pair, after which the GROUP_SIZE threads of a tile copy the values
// cooperatively: the tile walks over its members one by one and stages each
// member's value through shared memory with double buffering.
//
// Launch layout: 1D grid and block. Thread
// `blockIdx.x * blockDim.x + threadIdx.x` owns `keys[kv_idx]`.
// `blockDim.x` must not exceed BLOCK_SIZE because the static shared buffer is
// indexed by `threadIdx.x`. Threads with `kv_idx >= n` or a reserved key are
// marked ILLEGAL instead of returning, because all threads must reach the
// `__syncthreads()` and the tile shuffles below.
//
// Parameters:
//   buckets          Bucket array, indexed by the bucket index derived from
//                    the hashed key.
//   buckets_size     Per-bucket element counters; incremented atomically when
//                    an empty or reclaimed slot is taken.
//   buckets_num      Number of buckets.
//   bucket_capacity  Slots per bucket. Positions are wrapped with
//                    `& (bucket_capacity - 1)`, so a power-of-two capacity is
//                    assumed (NOTE(review): confirm against the launcher).
//   dim              Number of VecV elements that make up one value.
//   keys/values/scores  Input batch of `n` entries; `scores` is only handed to
//                    ScoreFunctor.
//   n                Number of KV-pairs in the batch.
//   global_epoch     Forwarded to ScoreFunctor::desired_when_missed.
//
// A slot is locked by CAS-ing its key to LOCKED_KEY and unlocked by the final
// release-store of the real key.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,
          uint32_t GROUP_SIZE = 16, int Strategy = -1>
__global__ void tlp_v2_upsert_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, const VecV* __restrict__ values,
    const S* __restrict__ scores, uint64_t n, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());

  // bucket_capacity is a multiple of 4.
  constexpr uint32_t STRIDE_S = 4;
  // Number of scores moved by one 16-byte load.
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  // Per-thread staging area for the score scan; reused as the value staging
  // buffer after the `__syncthreads()` below.
  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);

    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      // Snapshot only; other threads may change the size concurrently.
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      occupy_result = OccupyResult::ILLEGAL;
    }
  } else {
    occupy_result = OccupyResult::ILLEGAL;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One extra iteration wraps back to the first chunk: on the first pass
  // (offset == 0) empty slots located before `key_pos` are skipped, so the
  // final pass gives them a chance.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Byte-wise vectorized comparison: every equal byte becomes 0xff in the
      // result. Keeping only the lowest bit of each byte leaves exactly one
      // set bit per candidate.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        // Clear the candidate that is being examined.
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // memory_order_acquire:
        // modifications to the bucket are not reordered before this lock.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        // The key is already stored in the bucket and is now locked by us.
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);
        break;
      } else if (bucket_size == bucket_capacity) {
        // The sampled size says the bucket is full: skip the empty-slot probe.
        continue;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // On the first chunk, ignore slots in front of the start position.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }
  // Neither the key nor an empty slot was locked: look for the slot with the
  // minimum score and try to take it over. The loop repeats until a slot is
  // locked or the insertion is refused.
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
    // Prefetch the first chunk of scores into this thread's staging area.
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      // Prefetch the following chunk (if any) while the current one is being
      // consumed. NOTE(review): diff_buf/same_buf presumably select the two
      // alternating halves of the staging area; confirm in kernel_utils.cuh.
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      // Everything except the most recently committed batch has completed,
      // i.e. the chunk consumed below is available.
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            // Only slots that currently hold neither LOCKED_KEY nor EMPTY_KEY
            // are accepted as eviction candidates.
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }

    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (score < min_score) {
      // The new entry ranks below every candidate: do not insert it.
      occupy_result = OccupyResult::REFUSED;
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        // Re-read the score under the lock: the slot is only taken if its
        // score is still not greater than the minimum found by the scan.
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                           kv_idx, score, bucket_capacity,
                                           get_digest<K>(key), true);
          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            // A reclaimed slot did not count towards the bucket size.
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
          }
        } else {
          // The slot changed in the meantime: put the key back (unlock) and
          // rescan.
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }

  // Destination of this thread's value; stays null when nothing is written.
  VecV* bucket_value_ptr{nullptr};
  if ((occupy_result != OccupyResult::ILLEGAL) &&
      (occupy_result != OccupyResult::REFUSED)) {
    // 64-bit element offset to rule out 32-bit wrap-around.
    bucket_value_ptr = bucket_values_ptr + static_cast<uint64_t>(key_pos) * dim;
  }
  // Every thread must be done with the score staging area before it is
  // reused for values.
  __syncthreads();
  uint32_t rank = g.thread_rank();
  uint32_t groupID = threadIdx.x / GROUP_SIZE;

  // Shared memory reuse:
  // __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];
  // __shared__ VecV sm_values_buffer[GROUP_NUM][2][GROUP_BUF];
  // assert(GROUP_BUF >= 2 * dim);
  // NOTE(review): each half-buffer receives `dim` elements through the copy
  // helpers, so the launcher must keep one value within a half-buffer;
  // confirm the exact bound against the caller.
  constexpr uint32_t GROUP_BUFs =
      GROUP_SIZE * 2 * STRIDE_S * sizeof(S) / sizeof(VecV);
  constexpr uint32_t GROUP_BUF = GROUP_BUFs / 2;
  auto sm_values_buffer =
      reinterpret_cast<VecV*>(&(sm_bucket_scores[0][0])) + groupID * GROUP_BUFs;

  // Prologue: start the asynchronous load of member 0's value.
  auto occupy_result_next = g.shfl(occupy_result, 0);
  if ((occupy_result_next != OccupyResult::ILLEGAL) &&
      (occupy_result_next != OccupyResult::REFUSED)) {
    VecV* dst = sm_values_buffer;
    auto kv_idx_next = g.shfl(kv_idx, 0);
    // The element offset is computed in 64 bits: `kv_idx * dim` evaluated in
    // 32 bits would wrap for large batches.
    const VecV* src = values + static_cast<uint64_t>(kv_idx_next) * dim;
    CopyValue::ldg_sts(rank, dst, src, dim);
  }
  __pipeline_commit();

  for (int i = 0; i < GROUP_SIZE; i++) {
    // Start loading member i + 1's value into the other half-buffer.
    if (i + 1 < GROUP_SIZE) {
      auto occupy_result_next = g.shfl(occupy_result, i + 1);
      if ((occupy_result_next != OccupyResult::ILLEGAL) &&
          (occupy_result_next != OccupyResult::REFUSED)) {
        VecV* dst = sm_values_buffer + diff_buf(i) * GROUP_BUF;
        auto kv_idx_next = g.shfl(kv_idx, i + 1);
        const VecV* src = values + static_cast<uint64_t>(kv_idx_next) * dim;
        CopyValue::ldg_sts(rank, dst, src, dim);
      }
    }
    // Committed unconditionally so that every thread has the same number of
    // batches in flight.
    __pipeline_commit();
    auto occupy_result_cur = g.shfl(occupy_result, i);
    if ((occupy_result_cur != OccupyResult::ILLEGAL) &&
        (occupy_result_cur != OccupyResult::REFUSED)) {
      VecV* src = sm_values_buffer + same_buf(i) * GROUP_BUF;
      VecV* dst = g.shfl(bucket_value_ptr, i);
      // Wait for everything except the batch committed just above: member
      // i's value has landed while member i + 1's load stays in flight and
      // overlaps with the store below.
      __pipeline_wait_prior(1);
      CopyValue::lds_stg(rank, dst, src, dim);
    }
  }

  if ((occupy_result != OccupyResult::ILLEGAL) &&
      (occupy_result != OccupyResult::REFUSED)) {
    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
    // memory_order_release:
    // modifications to the bucket are not reordered after this store, which
    // also unlocks the slot by replacing LOCKED_KEY with the real key.
    key_address->store(key, cuda::std::memory_order_release);
  }
}

template <
    typename K, typename V, typename S, typename VecV, uint32_t BLOCK_SIZE,
    uint32_t GROUP_SIZE, uint32_t BUCKET_SIZE,
    uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE, uint32_t OFST_ParamScores = 0,
    uint32_t OFST_BucketValuesPtr = OFST_ParamScores + sizeof(S) * BLOCK_SIZE,
    uint32_t OFST_BucketsSizePtr =
        OFST_BucketValuesPtr + sizeof(VecV*) * BLOCK_SIZE,
    uint32_t OFST_BucketDigests =
        OFST_BucketsSizePtr + sizeof(int*) * BLOCK_SIZE,
    uint32_t OFST_BucketScores =
        OFST_BucketDigests + sizeof(D) * GROUP_NUM * 2 * BUCKET_SIZE,
    uint32_t OFST_BucketValues =
        OFST_BucketScores + sizeof(S) * GROUP_NUM * 2 * BUCKET_SIZE>
struct SharedMemoryManager_Pipeline_Upsert {
  /*
    Carves one raw shared-memory allocation into the regions below, in this
    order. The OFST_* template parameters are the byte offsets of the regions.

    __shared__ S sm_param_scores[BLOCK_SIZE];
    __shared__ VecV* sm_bucket_values_ptr[BLOCK_SIZE];
    __shared__ int* sm_buckets_size_ptr[BLOCK_SIZE];
    __shared__ D sm_bucket_digests[GROUP_NUM][2][BUCKET_SIZE];
    __shared__ S sm_bucket_scores[GROUP_NUM][2][BUCKET_SIZE];
    __shared__ VecV sm_values_buffer[GROUP_NUM][2][dim];
  */

  // Total number of bytes covered by all regions for values of `dim` VecV
  // elements.
  static inline uint32_t total_size(uint32_t dim) {
    // Three per-thread arrays: scores, value pointers and size pointers.
    const auto per_thread_bytes = sizeof(S) + sizeof(VecV*) + sizeof(int*);
    // One buffer of a group: digests and scores of a bucket plus one value.
    const auto per_buffer_bytes =
        BUCKET_SIZE * (sizeof(D) + sizeof(S)) + dim * sizeof(VecV);
    // Every group owns two such buffers.
    return BLOCK_SIZE * per_thread_bytes + GROUP_NUM * 2 * per_buffer_bytes;
  }

  // Start of the per-thread score array.
  static __forceinline__ __device__ S* param_scores(byte* smem) {
    byte* region = smem + OFST_ParamScores;
    return reinterpret_cast<S*>(region);
  }

  // Start of the per-thread array of bucket value pointers.
  static __forceinline__ __device__ VecV** bucket_values_ptr(byte* smem) {
    byte* region = smem + OFST_BucketValuesPtr;
    return reinterpret_cast<VecV**>(region);
  }

  // Start of the per-thread array of bucket size pointers.
  static __forceinline__ __device__ int** buckets_size_ptr(byte* smem) {
    byte* region = smem + OFST_BucketsSizePtr;
    return reinterpret_cast<int**>(region);
  }

  // Digest buffer `buf` (0 or 1) of group `groupID`; BUCKET_SIZE entries.
  static __forceinline__ __device__ D* bucket_digests(byte* smem,
                                                      uint32_t groupID,
                                                      uint32_t buf) {
    const uint32_t slot = groupID * 2 + buf;
    D* region = reinterpret_cast<D*>(smem + OFST_BucketDigests);
    return region + BUCKET_SIZE * slot;
  }

  // Score buffer `buf` (0 or 1) of group `groupID`; BUCKET_SIZE entries.
  static __forceinline__ __device__ S* bucket_scores(byte* smem,
                                                     uint32_t groupID,
                                                     uint32_t buf) {
    const uint32_t slot = groupID * 2 + buf;
    S* region = reinterpret_cast<S*>(smem + OFST_BucketScores);
    return region + BUCKET_SIZE * slot;
  }

  // Value buffer `buf` (0 or 1) of group `groupID`; `dim` VecV elements.
  static __forceinline__ __device__ VecV* values_buffer(byte* smem,
                                                        uint32_t groupID,
                                                        uint32_t buf,
                                                        uint32_t dim) {
    const uint32_t slot = groupID * 2 + buf;
    VecV* region = reinterpret_cast<VecV*>(smem + OFST_BucketValues);
    return region + dim * slot;
  }
};

template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void pipeline_upsert_kernel_with_io(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, const uint32_t dim, const K* __restrict__ keys,
    const VecV* __restrict__ values, const S* __restrict__ scores, uint64_t n,
    const S global_epoch) {
  // Here, GROUP_SIZE * Comp_LEN = BUCKET_SIZE.
  constexpr uint32_t BUCKET_SIZE = 128;
  constexpr uint32_t GROUP_SIZE = 32;
  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);
  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);

  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using SMM = SharedMemoryManager_Pipeline_Upsert<K, V, S, VecV, BLOCK_SIZE,
                                                  GROUP_SIZE, BUCKET_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  __shared__ extern __align__(alignof(byte16)) byte smem[];

  // Initialization.
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  VecD_Comp target_digests;
  K* bucket_keys_ptr{nullptr};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  uint32_t key_pos = 0;
  if (kv_idx < n) {
    key = keys[kv_idx];
    if (scores != nullptr) {
      S* sm_param_scores = SMM::param_scores(smem);
      __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));
    }
    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * BUCKET_SIZE));
      uint64_t bkt_idx = global_idx / BUCKET_SIZE;
      key_pos = get_start_position(global_idx, BUCKET_SIZE);
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      sm_buckets_size_ptr[tx] = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
      __pipeline_memcpy_async(sm_bucket_values_ptr + tx, &(bucket->vectors),
                              sizeof(VecV*));
    } else {
      occupy_result = OccupyResult::ILLEGAL;
    }
  } else {
    occupy_result = OccupyResult::ILLEGAL;
  }

  uint32_t rank = g.thread_rank();
  uint32_t groupID = threadIdx.x / GROUP_SIZE;

  // Pipeline loading.
  auto occupy_result_next = g.shfl(occupy_result, 0);
  auto keys_ptr_next = g.shfl(bucket_keys_ptr, 0);
  if (occupy_result_next == OccupyResult::INITIAL) {
    D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, 0);
    D* dst = sm_bucket_digests + rank * Load_LEN;
    D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);
    if (rank * Load_LEN < BUCKET_SIZE) {
      __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));
    }
  }
  __pipeline_commit();
  // Padding, meet the param of the first `__pipeline_wait_prior`
  // in the first loop.
  __pipeline_commit();
  __pipeline_commit();
  for (int32_t i = 0; i < GROUP_SIZE; i++) {
    // Step1: load digests from global memory to shared memory.
    if (i + 1 < GROUP_SIZE) {
      auto occupy_result_next = g.shfl(occupy_result, i + 1);
      auto keys_ptr_next = g.shfl(bucket_keys_ptr, i + 1);
      if (occupy_result_next == OccupyResult::INITIAL) {
        D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, diff_buf(i));
        D* dst = sm_bucket_digests + rank * Load_LEN;
        D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);
        if (rank * Load_LEN < BUCKET_SIZE) {
          __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));
        }
      }
    }
    __pipeline_commit();
    // Step2: to lock the target_key or empty_key by querying digests.
    auto occupy_result_cur = g.shfl(occupy_result, i);
    if (occupy_result_cur == OccupyResult::INITIAL) {
      uint32_t tx_cur = groupID * GROUP_SIZE + i;
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
      K key_cur = g.shfl(key, i);
      auto target_digests_cur = g.shfl(target_digests, i);
      auto start_pos_cur = g.shfl(key_pos, i);
      auto keys_ptr_cur = g.shfl(bucket_keys_ptr, i);
      auto bucket_size_cur = *bucket_size_ptr;
      __pipeline_wait_prior(3);
      D* src = SMM::bucket_digests(smem, groupID, same_buf(i));
      uint32_t start_offset = start_pos_cur / Comp_LEN;
      uint32_t probe_offset =
          Comp_LEN * ((start_offset + rank) & (GROUP_SIZE - 1));
      VecD_Comp probe_digests =
          *reinterpret_cast<VecD_Comp*>(src + probe_offset);
      uint32_t cmp_result = __vcmpeq4(probe_digests, target_digests_cur);
      cmp_result &= 0x01010101;
      uint32_t possible_pos = 0;
      bool result = false;
      do {
        if (cmp_result == 0) break;
        int32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = probe_offset + index;
        auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);
        K expected_key = key_cur;
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      uint32_t found_vote = g.ballot(result);
      if (found_vote) {
        int32_t src_lane = __ffs(found_vote) - 1;
        possible_pos = g.shfl(possible_pos, src_lane);
        if (rank == i) {
          occupy_result = OccupyResult::DUPLICATE;
          key_pos = possible_pos;
          S* sm_param_scores = SMM::param_scores(smem);
          S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,
                                                      global_epoch);
          ScoreFunctor::update_with_digest(
              bucket_keys_ptr, key_pos, sm_param_scores, tx, score, BUCKET_SIZE,
              get_digest<K>(key), false);
        }
      } else if (bucket_size_cur < BUCKET_SIZE) {
        VecD_Comp empty_digests_ = empty_digests<K>();
        cmp_result = __vcmpeq4(probe_digests, empty_digests_);
        cmp_result &= 0x01010101;
        for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {
          if (rank == offset) {
            do {
              if (cmp_result == 0) break;
              int32_t index = (__ffs(cmp_result) - 1) >> 3;
              cmp_result &= (cmp_result - 1);
              possible_pos = probe_offset + index;
              auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);
              K expected_key = static_cast<K>(EMPTY_KEY);
              result = current_key->compare_exchange_strong(
                  expected_key, static_cast<K>(LOCKED_KEY),
                  cuda::std::memory_order_acquire,
                  cuda::std::memory_order_relaxed);
            } while (!result);
          }
          uint32_t found_vote = g.ballot(result);
          if (found_vote) {
            int32_t src_lane = __ffs(found_vote) - 1;
            possible_pos = g.shfl(possible_pos, src_lane);
            if (rank == i) {
              occupy_result = OccupyResult::OCCUPIED_EMPTY;
              S* sm_param_scores = SMM::param_scores(smem);
              S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,
                                                          global_epoch);
              key_pos = possible_pos;
              ScoreFunctor::update_with_digest(
                  bucket_keys_ptr, key_pos, sm_param_scores, tx, score,
                  BUCKET_SIZE, get_digest<K>(key), true);
              atomicAdd(bucket_size_ptr, 1);
            }
            break;
          }
        }
      }
      occupy_result_cur = g.shfl(occupy_result, i);
      if (occupy_result_cur == OccupyResult::INITIAL) {
        S* sm_bucket_scores = SMM::bucket_scores(smem, groupID, same_buf(i));
        S* dst = sm_bucket_scores + rank * Load_LEN_S;
        S* src = BUCKET::scores(keys_ptr_cur, BUCKET_SIZE, rank * Load_LEN_S);
#pragma unroll
        for (int32_t k = 0; k < BUCKET_SIZE; k += GROUP_SIZE * Load_LEN_S) {
          __pipeline_memcpy_async(dst + k, src + k, sizeof(S) * Load_LEN_S);
        }
      }
    }
    __pipeline_commit();
    // Step 3: reduce to get the key with the minimum score.
    if (i > 0) {
      occupy_result_cur = g.shfl(occupy_result, i - 1);
      uint32_t tx_cur = groupID * GROUP_SIZE + i - 1;
      S* sm_param_scores = SMM::param_scores(smem);
      S score_cur = ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur,
                                                      global_epoch);
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
      __pipeline_wait_prior(3);
      S* src = SMM::bucket_scores(smem, groupID, diff_buf(i));
      while (occupy_result_cur == OccupyResult::INITIAL) {
        int min_pos_local = -1;
        S min_score_local = static_cast<S>(MAX_SCORE);
#pragma unroll
        for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {
          S temp_scores[Load_LEN_S];
          *reinterpret_cast<byte16*>(temp_scores) =
              *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);
#pragma unroll
          for (int k = 0; k < Load_LEN_S; k++) {
            S temp_score = temp_scores[k];
            if (temp_score < min_score_local) {
              min_score_local = temp_score;
              min_pos_local = rank * Load_LEN_S + j + k;
            }
          }
        }
        const S min_score_global =
            cg::reduce(g, min_score_local, cg::less<S>());
        if (score_cur < min_score_global) {
          if (rank == i - 1) {
            occupy_result = OccupyResult::REFUSED;
          }
          occupy_result_cur = g.shfl(occupy_result, i - 1);
          break;
        }
        uint32_t vote = g.ballot(min_score_local <= min_score_global);
        if (vote) {
          int src_lane = __ffs(vote) - 1;
          int min_pos_global = g.shfl(min_pos_local, src_lane);
          if (rank == i - 1) {
            src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.
            auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);
            auto expected_key =
                min_score_key->load(cuda::std::memory_order_relaxed);
            if (expected_key != static_cast<K>(LOCKED_KEY) &&
                expected_key != static_cast<K>(EMPTY_KEY)) {
              bool result = min_score_key->compare_exchange_strong(
                  expected_key, static_cast<K>(LOCKED_KEY),
                  cuda::std::memory_order_acquire,
                  cuda::std::memory_order_relaxed);
              if (result) {
                S* score_ptr = BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE,
                                              min_pos_global);
                auto verify_score_ptr =
                    reinterpret_cast<AtomicScore<S>*>(score_ptr);
                auto verify_score =
                    verify_score_ptr->load(cuda::std::memory_order_relaxed);
                if (verify_score <= min_score_global) {
                  if (expected_key == static_cast<K>(RECLAIM_KEY)) {
                    occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
                    atomicAdd(bucket_size_ptr, 1);
                  } else {
                    occupy_result = OccupyResult::EVICT;
                  }
                  key_pos = min_pos_global;
                  ScoreFunctor::update_with_digest(
                      bucket_keys_ptr, key_pos, sm_param_scores, tx_cur,
                      score_cur, BUCKET_SIZE, get_digest<K>(key), true);
                } else {
                  min_score_key->store(expected_key,
                                       cuda::std::memory_order_release);
                }
              }
            }
          }
          occupy_result_cur = g.shfl(occupy_result, i - 1);
        }
      }
      // Prefetch values to shared memory.
      if (occupy_result_cur != OccupyResult::ILLEGAL &&
          occupy_result_cur != OccupyResult::REFUSED) {
        VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(i), dim);
        auto kv_idx_cur = g.shfl(kv_idx, i - 1);
        const VecV* src = values + kv_idx_cur * dim;
        CopyValue::ldg_sts(rank, dst, src, dim);
      }
    }
    __pipeline_commit();

    // Step 4: write values to bucket or param buffer.
    if (i > 1) {
      occupy_result_cur = g.shfl(occupy_result, i - 2);
      if (occupy_result_cur != OccupyResult::ILLEGAL &&
          occupy_result_cur != OccupyResult::REFUSED) {
        VecV* src = SMM::values_buffer(smem, groupID, same_buf(i), dim);
        VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
        auto bucket_values_ptr =
            sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 2];
        auto key_pos_cur = g.shfl(key_pos, i - 2);
        VecV* dst = bucket_values_ptr + key_pos_cur * dim;
        __pipeline_wait_prior(3);
        CopyValue::lds_stg(rank, dst, src, dim);
        if (rank == i - 2) {
          auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
          key_address->store(key, cuda::std::memory_order_release);
        }
      }
    }
  }
  auto occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
  uint32_t tx_cur = groupID * GROUP_SIZE + GROUP_SIZE - 1;
  S* sm_param_scores = SMM::param_scores(smem);
  S score_cur =
      ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur, global_epoch);
  int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
  auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
  __pipeline_wait_prior(1);
  S* src = SMM::bucket_scores(smem, groupID, diff_buf(GROUP_SIZE));
  while (occupy_result_cur == OccupyResult::INITIAL) {
    int min_pos_local = -1;
    S min_score_local = static_cast<S>(MAX_SCORE);
#pragma unroll
    for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {
      S temp_scores[Load_LEN_S];
      *reinterpret_cast<byte16*>(temp_scores) =
          *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);
#pragma unroll
      for (int k = 0; k < Load_LEN_S; k++) {
        S temp_score = temp_scores[k];
        if (temp_score < min_score_local) {
          min_score_local = temp_score;
          min_pos_local = rank * Load_LEN_S + j + k;
        }
      }
    }
    const S min_score_global = cg::reduce(g, min_score_local, cg::less<S>());
    if (score_cur < min_score_global) {
      if (rank == GROUP_SIZE - 1) {
        occupy_result = OccupyResult::REFUSED;
      }
      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
      break;
    }
    uint32_t vote = g.ballot(min_score_local <= min_score_global);
    if (vote) {
      int src_lane = __ffs(vote) - 1;
      int min_pos_global = g.shfl(min_pos_local, src_lane);
      if (rank == GROUP_SIZE - 1) {
        src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.
        auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);
        auto expected_key =
            min_score_key->load(cuda::std::memory_order_relaxed);
        if (expected_key != static_cast<K>(LOCKED_KEY) &&
            expected_key != static_cast<K>(EMPTY_KEY)) {
          auto min_score_ptr =
              BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);
          bool result = min_score_key->compare_exchange_strong(
              expected_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
          if (result) {
            S* score_ptr =
                BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);
            auto verify_score_ptr =
                reinterpret_cast<AtomicScore<S>*>(score_ptr);
            auto verify_score =
                verify_score_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_score <= min_score_global) {
              if (expected_key == static_cast<K>(RECLAIM_KEY)) {
                atomicAdd(bucket_size_ptr, 1);
                occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
              } else {
                occupy_result = OccupyResult::EVICT;
              }
              key_pos = min_pos_global;
              ScoreFunctor::update_with_digest(
                  bucket_keys_ptr, key_pos, sm_param_scores, tx_cur, score_cur,
                  BUCKET_SIZE, get_digest<K>(key), true);
            } else {
              min_score_key->store(expected_key,
                                   cuda::std::memory_order_release);
            }
          }
        }
      }
      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
    }
  }
  // Prefetch values to shared memory.
  if (occupy_result_cur != OccupyResult::ILLEGAL &&
      occupy_result_cur != OccupyResult::REFUSED) {
    VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(GROUP_SIZE), dim);
    auto kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);
    const VecV* src = values + kv_idx_cur * dim;
    CopyValue::ldg_sts(rank, dst, src, dim);
  }
  __pipeline_commit();

  // Step 4: write values to bucket or param buffer.
  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 2);
  if (occupy_result_cur != OccupyResult::ILLEGAL &&
      occupy_result_cur != OccupyResult::REFUSED) {
    VecV* src = SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE), dim);
    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
    auto bucket_values_ptr =
        sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 2];
    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 2);
    VecV* dst = bucket_values_ptr + key_pos_cur * dim;
    __pipeline_wait_prior(1);
    CopyValue::lds_stg(rank, dst, src, dim);
    if (rank == GROUP_SIZE - 2) {
      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
      key_address->store(key, cuda::std::memory_order_release);
    }
  }

  // Step 4: write values to bucket or param buffer.
  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
  if (occupy_result_cur != OccupyResult::ILLEGAL &&
      occupy_result_cur != OccupyResult::REFUSED) {
    VecV* src =
        SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE + 1), dim);
    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
    auto bucket_values_ptr =
        sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1];
    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);
    VecV* dst = bucket_values_ptr + key_pos_cur * dim;
    __pipeline_wait_prior(0);
    CopyValue::lds_stg(rank, dst, src, dim);
    if (rank == GROUP_SIZE - 1) {
      auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
      key_address->store(key, cuda::std::memory_order_release);
    }
  }
}

// Argument bundle consumed by the `Launch_*_Upsert` launchers and by
// `KernelSelector_Upsert::select_kernel`. It is a plain collection of values
// and raw pointers; the struct defines no destructor and frees nothing.
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct Params_Upsert {
  // Read by `KernelSelector_Upsert::select_kernel` when choosing a kernel.
  float load_factor;
  // Bucket array, forwarded unchanged to the kernels.
  Bucket<K, V, S>* __restrict__ buckets;
  // Per-bucket size counters, forwarded unchanged to the kernels.
  int* buckets_size;
  // Number of entries in `buckets`, forwarded unchanged to the kernels.
  size_t buckets_num;
  // Bucket capacity; `select_kernel` special-cases the value 128.
  uint32_t bucket_capacity;
  // Value length. On construction it is expressed in units of `V`
  // (`dim * sizeof(V)` is treated as the value size in bytes). NOTE: every
  // `Launch_*_Upsert::launch_kernel` overwrites it with the length expressed
  // in units of its `VecV` type.
  uint32_t dim;
  // Input keys / values / scores, forwarded to the kernels.
  const K* __restrict__ keys;
  const V* __restrict__ values;
  const S* __restrict__ scores;
  // Element count; the launchers size their grids as ceil(n / BLOCK_SIZE).
  uint64_t n;
  // Epoch value forwarded unchanged to the kernels.
  const S global_epoch;

  // Stores every argument in the member of the same name (without the
  // trailing underscore). Members are initialised in declaration order.
  Params_Upsert(float load_factor_, Bucket<K, V, S>* __restrict__ buckets_,
                int* buckets_size_, size_t buckets_num_,
                uint32_t bucket_capacity_, uint32_t dim_,
                const K* __restrict__ keys_, const V* __restrict__ values_,
                const S* __restrict__ scores_, size_t n_, const S global_epoch_)
      : load_factor{load_factor_},
        buckets{buckets_},
        buckets_size{buckets_size_},
        buckets_num{buckets_num_},
        bucket_capacity{bucket_capacity_},
        dim{dim_},
        keys{keys_},
        values{values_},
        scores{scores_},
        n{n_},
        global_epoch{global_epoch_} {}
};

// Host-side launcher for `tlp_v1_upsert_kernel_with_io`.
//
// `VecV` is the vector type used to move values; the caller must pick one
// whose size divides the value size in bytes.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_TLPv1_Upsert {
  using Params = Params_Upsert<K, V, S>;

  // Launches the kernel on `stream` with 128-thread blocks and enough blocks
  // to provide at least `params.n` threads.
  //
  // Side effect: `params.dim` is overwritten with the value length expressed
  // in `VecV` elements, so the caller's object is modified.
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;

    // Convert the value length from units of V to units of VecV.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // Ceiling division so that a partially filled last block is launched.
    const auto grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const VecV* vec_values = reinterpret_cast<const VecV*>(params.values);

    tlp_v1_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>
        <<<grid_size, BLOCK_SIZE, 0, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.bucket_capacity, params.dim, params.keys, vec_values,
            params.scores, params.n, params.global_epoch);
  }
};

// Host-side launcher for `tlp_v2_upsert_kernel_with_io`.
//
// The kernel is instantiated with a group size of 8 when a value occupies at
// most 256 bytes and with a group size of 16 otherwise.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_TLPv2_Upsert {
  using Params = Params_Upsert<K, V, S>;

 private:
  // Launches the kernel for one compile-time group size. Expects `params.dim`
  // to already be expressed in `VecV` elements.
  template <int GROUP_SIZE>
  static void launch_with_group(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    // Ceiling division so that a partially filled last block is launched.
    const auto grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const VecV* vec_values = reinterpret_cast<const VecV*>(params.values);

    tlp_v2_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, GROUP_SIZE,
                                 Strategy>
        <<<grid_size, BLOCK_SIZE, 0, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.bucket_capacity, params.dim, params.keys, vec_values,
            params.scores, params.n, params.global_epoch);
  }

 public:
  // Picks the group size from the value size in bytes and launches the kernel
  // on `stream`.
  //
  // Side effect: `params.dim` is overwritten with the value length expressed
  // in `VecV` elements, so the caller's object is modified.
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    // Value size in bytes, captured before `dim` is rescaled.
    const uint32_t value_size = params.dim * sizeof(V);
    params.dim = value_size / sizeof(VecV);

    if (value_size > 256) {
      launch_with_group<16>(params, stream);
    } else {
      launch_with_group<8>(params, stream);
    }
  }
};

// Host-side launcher for `pipeline_upsert_kernel_with_io`.
//
// The shared-memory layout is computed for groups of 32 threads and buckets
// of 128 slots; `params.bucket_capacity` is not passed to the kernel.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_Pipeline_Upsert {
  using Params = Params_Upsert<K, V, S>;

  // Launches the kernel on `stream` with 128-thread blocks, enough blocks to
  // provide at least `params.n` threads, and a dynamic shared-memory request
  // derived from the value length.
  //
  // Side effect: `params.dim` is overwritten with the value length expressed
  // in `VecV` elements, so the caller's object is modified.
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int BLOCK_SIZE = 128;
    constexpr uint32_t GROUP_SIZE = 32;
    constexpr uint32_t BUCKET_SIZE = 128;
    using SMM = SharedMemoryManager_Pipeline_Upsert<K, V, S, VecV, BLOCK_SIZE,
                                                    GROUP_SIZE, BUCKET_SIZE>;

    // Convert the value length from units of V to units of VecV.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // Round the dynamic shared-memory request up to a multiple of 16 bytes.
    constexpr size_t ALIGNMENT = sizeof(byte16);
    const uint32_t raw_bytes = SMM::total_size(params.dim);
    const uint32_t shared_mem =
        (raw_bytes + ALIGNMENT - 1) / ALIGNMENT * ALIGNMENT;

    // Ceiling division so that a partially filled last block is launched.
    const auto grid_size = (params.n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const VecV* vec_values = reinterpret_cast<const VecV*>(params.values);

    pipeline_upsert_kernel_with_io<K, V, S, VecV, BLOCK_SIZE, Strategy>
        <<<grid_size, BLOCK_SIZE, shared_mem, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.dim, params.keys, vec_values, params.scores, params.n,
            params.global_epoch);
  }
};

// Per-architecture value-size thresholds (in bytes) consulted by
// `KernelSelector_Upsert`. Only the explicit specializations are defined;
// the primary template is intentionally left incomplete.
template <typename ArchTag>
struct ValueConfig_Upsert;

// Value-size thresholds (in bytes) for the Sm80 architecture tag.
template <>
struct ValueConfig_Upsert<Sm80> {
  // Largest value size routed to TLPv1; TLPv1 performs poorly on values
  // larger than this.
  static constexpr uint32_t size_tlp_v1 = 8 * sizeof(byte4);
  // Largest value size accepted by `KernelSelector_Upsert::callable` when
  // CUDART_VERSION >= 11030.
  static constexpr uint32_t size_tlp_v2 = 128 * sizeof(byte4);
};

// Value-size thresholds (in bytes) for the Sm70 architecture tag.
template <>
struct ValueConfig_Upsert<Sm70> {
  // Largest value size routed to TLPv1; TLPv1 performs poorly on values
  // larger than this.
  static constexpr uint32_t size_tlp_v1 = 8 * sizeof(byte4);
  // Largest value size accepted by `KernelSelector_Upsert::callable` when
  // CUDART_VERSION >= 11030.
  static constexpr uint32_t size_tlp_v2 = 128 * sizeof(byte4);
};

// Chooses between the TLPv1, TLPv2 and Pipeline upsert kernels and launches
// the chosen one with the widest vector type that divides the value size.
template <typename K, typename V, typename S, int Strategy, typename ArchTag>
struct KernelSelector_Upsert {
  using ValueConfig = ValueConfig_Upsert<ArchTag>;
  using Params = Params_Upsert<K, V, S>;

 private:
  // Invokes `Launcher<..., VecV, ...>::launch_kernel` with the widest of
  // byte16 / byte8 / byte4 / byte2 / byte whose size evenly divides
  // `value_bytes`.
  template <template <typename, typename, typename, typename, int>
            class Launcher>
  static void launch_vectorized(Params& params, cudaStream_t& stream,
                                const uint32_t value_bytes) {
    if (value_bytes % sizeof(byte16) == 0) {
      Launcher<K, V, S, byte16, Strategy>::launch_kernel(params, stream);
    } else if (value_bytes % sizeof(byte8) == 0) {
      Launcher<K, V, S, byte8, Strategy>::launch_kernel(params, stream);
    } else if (value_bytes % sizeof(byte4) == 0) {
      Launcher<K, V, S, byte4, Strategy>::launch_kernel(params, stream);
    } else if (value_bytes % sizeof(byte2) == 0) {
      Launcher<K, V, S, byte2, Strategy>::launch_kernel(params, stream);
    } else {
      Launcher<K, V, S, byte, Strategy>::launch_kernel(params, stream);
    }
  }

 public:
  // Returns true when `select_kernel` may be used: keys must be unique, the
  // bucket must hold at least one full digest vector load, and the value size
  // in bytes must not exceed the limit for the CUDA runtime being compiled
  // against.
  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {
    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
    constexpr uint32_t max_value_size = ValueConfig::size_tlp_v2;
#else
    constexpr uint32_t max_value_size = ValueConfig::size_tlp_v1;
#endif
    const uint32_t value_size = dim * sizeof(V);
    return unique_key && bucket_size >= MinBucketCap &&
           value_size <= max_value_size;
  }

  // Launches one upsert kernel on `stream`.
  //
  // `params.dim` must hold the value length in units of `V` on entry; the
  // launcher that runs overwrites it (see the `Launch_*_Upsert` structs).
  static void select_kernel(Params& params, cudaStream_t& stream) {
    // Value size in bytes, captured before any launcher rescales params.dim.
    const uint32_t total_value_size =
        static_cast<uint32_t>(params.dim * sizeof(V));
    const bool fits_tlp_v1 = total_value_size <= ValueConfig::size_tlp_v1;

    // This part is according to the test on A100.
    if (params.bucket_capacity != 128) {
      // Buckets that are not 128 slots wide never use the Pipeline kernel.
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
      if (fits_tlp_v1) {
        launch_vectorized<Launch_TLPv1_Upsert>(params, stream,
                                               total_value_size);
      } else {
        launch_vectorized<Launch_TLPv2_Upsert>(params, stream,
                                               total_value_size);
      }
#else
      launch_vectorized<Launch_TLPv1_Upsert>(params, stream, total_value_size);
#endif
      return;
    }

    if (fits_tlp_v1) {
      // Small values: TLPv1 unless the load factor exceeds 0.98.
      if (params.load_factor <= 0.98f) {
        launch_vectorized<Launch_TLPv1_Upsert>(params, stream,
                                               total_value_size);
      } else {
        launch_vectorized<Launch_Pipeline_Upsert>(params, stream,
                                                  total_value_size);
      }
      return;
    }

    // Larger values: TLPv2 (when compiled in) up to a load factor of 0.95,
    // Pipeline otherwise.
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
    if (params.load_factor <= 0.95f) {
      launch_vectorized<Launch_TLPv2_Upsert>(params, stream, total_value_size);
      return;
    }
#endif
    launch_vectorized<Launch_Pipeline_Upsert>(params, stream,
                                              total_value_size);
  }  // End function
};

// Inserts or updates key/value/score triples, with TILE_SIZE threads
// cooperating on each key.
//
// Launch layout: 1-D grid and 1-D blocks. The block is partitioned into tiles
// of TILE_SIZE threads; flat thread index `t` works on key `t / TILE_SIZE`, so
// `N` is expected to be (number of keys) * TILE_SIZE. The loop strides by the
// total number of launched threads, so any grid size is valid.
//
// Parameters:
//   table           - only `table->buckets_size` is read from it.
//   buckets         - bucket array passed to `get_key_position`.
//   bucket_max_size - capacity of one bucket.
//   buckets_num     - number of buckets.
//   dim             - number of `V` elements per value.
//   keys/values     - inputs; value `i` starts at `values + i * dim`.
//   scores          - passed to `ScoreFunctor` together with the key index.
//   global_epoch    - passed to `ScoreFunctor::desired_when_missed`.
//   N               - exclusive upper bound of the flat thread index.
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void upsert_kernel_with_io_core(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, const V* __restrict values,
    const S* __restrict scores, const S global_epoch, size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // blockIdx / blockDim / gridDim components are 32-bit unsigned, so widen to
  // size_t *before* multiplying; otherwise the start index and the stride
  // wrap around once the launch covers more than 2^32 threads.
  const size_t first_idx =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = first_idx; t < N; t += stride) {
    int key_pos = -1;
    // All threads of a tile share the same key index.
    size_t key_idx = t / TILE_SIZE;

    const K insert_key = keys[key_idx];

    // Reserved keys are skipped; the whole tile takes this branch together
    // because every thread in it reads the same key.
    if (IS_RESERVED_KEY<K>(insert_key)) {
      continue;
    }

    const S insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
    const V* insert_value = values + key_idx * dim;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, insert_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    OccupyResult occupy_result{OccupyResult::INITIAL};
    // Snapshot of the bucket size; it is not re-read while retrying below.
    const int bucket_size = buckets_size[bkt_idx];
    do {
      if (bucket_size < bucket_max_size) {
        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      } else {
        // Align the probe start down to a multiple of the tile width.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,
                                                ScoreFunctor::LOCK_MEM_ORDER,
                                                ScoreFunctor::UNLOCK_MEM_ORDER>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      }

      // Broadcast the result held by `src_lane` to the whole tile.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    if (occupy_result == OccupyResult::REFUSED) {
      continue;
    }

    // A slot that was empty or reclaimed grows the bucket by one; only
    // `src_lane` performs the increment so it happens once per key.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    // The tile cooperatively copies the value into slot `key_pos`.
    copy_vector<V, TILE_SIZE>(g, insert_value, bucket->vectors + key_pos * dim,
                              dim);
    if (g.thread_rank() == src_lane) {
      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,
                           (occupy_result != OccupyResult::DUPLICATE));
      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);
      // Storing the real key is the last write to the slot.
      // NOTE(review): presumably this is what releases the slot locked by
      // find_and_lock_* - confirm against those helpers.
      (bucket->keys(key_pos))
          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
    }
  }
}

// Picks the tile width for `upsert_kernel_with_io_core` from the load factor
// and launches the kernel:
//   load_factor <= 0.5   -> 4 threads per key
//   load_factor <= 0.875 -> 8 threads per key
//   otherwise            -> 32 threads per key
template <typename K, typename V, typename S, int Strategy>
struct SelectUpsertKernelWithIO {
 private:
  // Launches the core kernel with TILE_SIZE threads per key, i.e. with
  // n * TILE_SIZE as the kernel's flat index bound.
  template <uint32_t TILE_SIZE>
  static void launch_with_tile(const int block_size,
                               const size_t bucket_max_size,
                               const size_t buckets_num, const size_t dim,
                               cudaStream_t& stream, const size_t n,
                               const Table<K, V, S>* __restrict table,
                               Bucket<K, V, S>* buckets,
                               const K* __restrict keys,
                               const V* __restrict values,
                               const S* __restrict scores,
                               const S global_epoch) {
    const size_t total_threads = n * TILE_SIZE;
    const size_t num_blocks = SAFE_GET_GRID_SIZE(total_threads, block_size);
    upsert_kernel_with_io_core<K, V, S, Strategy, TILE_SIZE>
        <<<num_blocks, block_size, 0, stream>>>(
            table, buckets, bucket_max_size, buckets_num, dim, keys, values,
            scores, global_epoch, total_threads);
  }

 public:
  // Launches the upsert kernel for `n` keys on `stream` using `block_size`
  // threads per block. The launch is asynchronous and no error check is
  // performed here.
  static void execute_kernel(const float& load_factor, const int& block_size,
                             const size_t bucket_max_size,
                             const size_t buckets_num, const size_t dim,
                             cudaStream_t& stream, const size_t& n,
                             const Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, const K* __restrict keys,
                             const V* __restrict values,
                             const S* __restrict scores, const S global_epoch) {
    if (load_factor <= 0.5) {
      launch_with_tile<4>(block_size, bucket_max_size, buckets_num, dim, stream,
                          n, table, buckets, keys, values, scores,
                          global_epoch);
      return;
    }
    if (load_factor <= 0.875) {
      launch_with_tile<8>(block_size, bucket_max_size, buckets_num, dim, stream,
                          n, table, buckets, keys, values, scores,
                          global_epoch);
      return;
    }
    launch_with_tile<32>(block_size, bucket_max_size, buckets_num, dim, stream,
                         n, table, buckets, keys, values, scores, global_epoch);
  }
};

// Use 1 thread to deal with a KV-pair.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void upsert_kernel_lock_key_hybrid(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, V** __restrict__ value_ptrs,
    const S* __restrict__ scores, K** __restrict__ key_ptrs,
    int* __restrict keys_index, uint64_t n, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // bucket_capacity is a multiple of 4.
  constexpr uint32_t STRIDE_S = 4;
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  V* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];

    // help to address the original key after sorting value pointers.
    if (keys_index) {
      keys_index[kv_idx] = kv_idx;
    }

    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<V*>(bucket->vectors);
    } else {
      key_ptrs[kv_idx] = nullptr;
      value_ptrs[kv_idx] = nullptr;
      return;
    }
  } else {
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // Modifications to the bucket will not be reordered before this
        // instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);
        break;
      } else if (bucket_size == bucket_capacity) {
        continue;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }

  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }

    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (score <= min_score) {
      occupy_result = OccupyResult::REFUSED;
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                           kv_idx, score, bucket_capacity,
                                           get_digest<K>(key), true);
          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
          }
        } else {
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }

  if (kv_idx < n) {
    if (occupy_result == OccupyResult::REFUSED) {
      value_ptrs[kv_idx] = nullptr;
      key_ptrs[kv_idx] = nullptr;
    } else {
      value_ptrs[kv_idx] = bucket_values_ptr + key_pos * dim;
      key_ptrs[kv_idx] = bucket_keys_ptr + key_pos;
    }
  }
}

/* Copy value elements from `src` to the per-row addresses in `dst` and store
   each key through its pointer in `key_ptrs`.

   `src`: Contiguous values, `dim` elements per row.
   `dst`: One destination pointer per row; rows with a null pointer are
          skipped.
   `src_offset`: Optional row remapping. When null, row `i` of `src` feeds
                 `dst[i]`; otherwise row `src_offset[i]` does.
   `keys`, `key_ptrs`: Key values and the addresses they are stored to, both
                       indexed by the (remapped) source row. Null key
                       pointers are skipped.
   `N`: Number of flat elements to process; `t / dim` selects the row and
        `t % dim` the element inside the row.
*/
template <class K, class V, class S>
__global__ void write_kernel_unlock_key(const V* __restrict src,
                                        V** __restrict dst,
                                        const int* __restrict src_offset,
                                        const size_t dim, const K* keys,
                                        K** __restrict__ key_ptrs,
                                        const size_t N) {
  const size_t first = (blockIdx.x * blockDim.x) + threadIdx.x;

  for (size_t t = first; t < N; t += blockDim.x * gridDim.x) {
    const int row = int(t / dim);
    const int col = t % dim;

    int src_row;
    if (src_offset) {
      src_row = src_offset[row];
    } else {
      src_row = row;
    }

    K* const key_slot = key_ptrs[src_row];
    const K key_value = keys[src_row];
    V* const value_slot = dst[row];

    // Only the thread that owns element 0 of a row stores the key.
    if (col == 0 && key_slot) {
      *key_slot = key_value;
    }

    if (!value_slot) continue;
    value_slot[col] = src[src_row * dim + col];
  }
}

/* Upsert with the end-user specified score.

   Every key is served by one tile of TILE_SIZE cooperating threads: flat
   thread index `t` maps to key index `t / TILE_SIZE`, so `N` is the total
   number of participating threads, not the number of keys.

   For each non-reserved key the tile locks a slot in the target bucket, then
   the lane that owns the slot (`src_lane`):
     - stores the slot's value address into `vectors[key_idx]`,
     - updates the slot's score and digest,
     - publishes the key into the slot.
   The values themselves are not copied here. Keys that are reserved or whose
   insertion is REFUSED leave `vectors[key_idx]` untouched.

   `src_offset`: Optional; when non-null, `src_offset[key_idx]` is set to
                 `key_idx` for every non-reserved key.
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void upsert_kernel(const Table<K, V, S>* __restrict table,
                              Bucket<K, V, S>* buckets,
                              const size_t bucket_max_size,
                              const size_t buckets_num, const size_t dim,
                              const K* __restrict keys, V** __restrict vectors,
                              const S* __restrict scores,
                              int* __restrict src_offset, const S global_epoch,
                              size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_pos = -1;
    size_t key_idx = t / TILE_SIZE;

    const K insert_key = keys[key_idx];
    if (IS_RESERVED_KEY<K>(insert_key)) continue;

    const S insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, insert_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    if (src_offset != nullptr && g.thread_rank() == 0) {
      *(src_offset + key_idx) = key_idx;
    }

    OccupyResult occupy_result{OccupyResult::INITIAL};
    // Snapshot of the bucket occupancy; it selects the search routine below.
    const int bucket_size = buckets_size[bkt_idx];
    do {
      if (bucket_size < bucket_max_size) {
        // The bucket still has free slots: look for the key or a vacancy.
        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      } else {
        // The bucket is full: the search must be able to pick a victim to
        // evict, which the vacant-slot search cannot do. The tile-aligned
        // start index is prepared for that full-bucket scan.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      }

      // All lanes adopt the result reported by the lane that owns the slot.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    if (occupy_result == OccupyResult::REFUSED) continue;

    // A previously unused slot became occupied: grow the bucket size once.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    if (g.thread_rank() == src_lane) {
      *(vectors + key_idx) = (bucket->vectors + key_pos * dim);
      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,
                           (occupy_result != OccupyResult::DUPLICATE));
      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);
      // Storing the real key replaces whatever the search left in the slot.
      (bucket->keys(key_pos))
          ->store(insert_key, cuda::std::memory_order_relaxed);
    }
  }
}

/* Scatter value elements from `src` to the per-vector addresses in `dst`,
   usually called after the upsert kernel has filled `dst`.

   `src`: A contiguous buffer of vectors, `dim` elements each.
   `dst`: One destination pointer per vector; vectors whose pointer is null
          are skipped.
   `src_offset`: Optional remapping; when non-null, vector `src_offset[i]` of
                 `src` is written to `dst[i]`, otherwise vector `i` is.
   `dim`: Number of elements per vector.
   `N`: Number of flat elements to process; `t / dim` is the vector index and
        `t % dim` the element inside the vector.
*/
template <class K, class V, class S>
__global__ void write_kernel(const V* __restrict src, V** __restrict dst,
                             const int* __restrict src_offset, const size_t dim,
                             const size_t N) {
  const size_t first = (blockIdx.x * blockDim.x) + threadIdx.x;

  for (size_t t = first; t < N; t += blockDim.x * gridDim.x) {
    const int row = int(t / dim);
    const int col = t % dim;

    V* const out_row = dst[row];
    if (out_row == nullptr) continue;

    // Without a remapping table the source row equals the destination row.
    const size_t src_pos = (src_offset != nullptr)
                               ? src_offset[row] * dim + col
                               : row * dim + col;
    out_row[col] = src[src_pos];
  }
}

/* Write the N vectors from src to each address in *dst by using CPU threads,
 * usually called by upsert kernel.
 *
 * @note: In some machines with AMD CPUs, the `write_kernel` has low performance
 * thru PCI-E, so we try to use the `memcpy` on CPU threads for writing work to
 * reach better performance.
 *
 * @param dst One destination pointer per vector; null entries are skipped.
 * @param src Contiguous source vectors, `dim` elements each.
 * @param offset Optional remapping: vector `offset[i]` of `src` is copied to
 * `dst[i]`. When null, vector `i` is used (same convention as `write_kernel`).
 * @param dim Number of elements per vector.
 * @param N Number of vectors (entries of `dst`); nothing is done if `N <= 0`.
 * @param n_worker Maximum number of CPU threads; values below 1 are treated
 * as 1. At most `N` threads are started.
 */
template <class V>
void write_by_cpu(V** __restrict dst, const V* __restrict src,
                  const int* __restrict offset, size_t dim, int N,
                  int n_worker = 16) {
  // Nothing to copy; also keeps the chunk arithmetic below on positive sizes.
  if (N <= 0) return;
  if (n_worker < 1) n_worker = 1;

  // Copies the vectors [begin, begin + count) of `dst_rows`.
  auto functor = [dim](V** __restrict dst_rows, const V* __restrict src_rows,
                       const int* __restrict row_offset, int begin,
                       int count) -> void {
    for (int i = begin; i < begin + count; i++) {
      if (dst_rows[i] != nullptr) {
        // A null remapping table means "identity".
        const size_t src_row = (row_offset != nullptr)
                                   ? static_cast<size_t>(row_offset[i])
                                   : static_cast<size_t>(i);
        memcpy(dst_rows[i], src_rows + src_row * dim, sizeof(V) * dim);
      }
    }
  };

  // Split N into `n_worker_used` chunks whose sizes differ by at most one:
  // the first `N % n_worker` chunks get one extra vector.
  const int32_t trunk_size_floor = N / n_worker;
  int32_t trunk_size_remain = N % n_worker;
  const int32_t n_worker_used =
      trunk_size_floor == 0 ? trunk_size_remain : n_worker;

  std::vector<std::thread> thds;
  thds.reserve(n_worker_used);

  int handled_size = 0;
  for (int i = 0; i < n_worker_used; i++) {
    int32_t cur_trunk_size = trunk_size_floor;
    if (trunk_size_remain != 0) {
      cur_trunk_size += 1;
      trunk_size_remain--;
    }
    thds.emplace_back(functor, dst, src, offset, handled_size, cur_trunk_size);
    handled_size += cur_trunk_size;
  }

  for (auto& thd : thds) {
    thd.join();
  }
}

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels/upsert_and_evict.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "kernel_utils.cuh"

namespace nv {
namespace merlin {

// Use 1 thread to deal with a KV-pair, including copying value.
//
// Thread `kv_idx = blockIdx.x * blockDim.x + threadIdx.x` handles
// `keys[kv_idx]`; threads with `kv_idx >= n` or a reserved key return early.
// Shared memory: `sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S]`, one row per
// thread, indexed by `threadIdx.x`.
//
// Steps per key:
//   1. Probe the bucket digests for the key itself (DUPLICATE) or, if the
//      bucket size snapshot is below capacity, for an empty slot
//      (OCCUPIED_EMPTY). A slot is claimed by CAS-ing its key to LOCKED_KEY.
//   2. Otherwise scan the bucket scores for the minimum-score slot and try to
//      lock it (OCCUPIED_RECLAIMED / EVICT), or give up (REFUSED) when the
//      incoming score is lower than that minimum.
//   3. Copy values. On EVICT the old value, key and score go to slot
//      `evict_idx` of the `evicted_*` outputs; on REFUSED the incoming
//      key/score/value go there instead. `evict_idx` comes from
//      `atomicAdd(evicted_counter, 1)`.
//
// NOTE(review): `kv_idx`, `evict_idx` and `dim` are 32-bit, so the value
// offsets `kv_idx * dim` and `evict_idx * dim` are computed in 32 bits;
// confirm callers keep these products below 2^32.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void tlp_v1_upsert_and_evict_kernel_unique(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, const VecV* __restrict__ values,
    const S* __restrict__ scores, K* __restrict__ evicted_keys,
    VecV* __restrict__ evicted_values, S* __restrict__ evicted_scores,
    uint64_t n, uint64_t* __restrict__ evicted_counter, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, 1>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // bucket_capacity is a multiple of 4.
  constexpr uint32_t STRIDE_S = 4;
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  __shared__ __align__(sizeof(byte16))
      S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t evict_idx{0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      // Snapshot only; other threads may change the size concurrently.
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      return;
    }
  } else {
    return;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep one bit per matching byte so `x & (x - 1)` drops one match.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // memory_order_acquire:
        // Modifications to the bucket will not be reordered before this
        // instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        // The last argument evaluates to false here (key already present).
        ScoreFunctor::update_with_digest(
            bucket_keys_ptr, key_pos, scores, kv_idx, score, bucket_capacity,
            get_digest<K>(key), (occupy_result != OccupyResult::DUPLICATE));
        break;
      } else if (bucket_size == bucket_capacity) {
        // The bucket looked full: do not search for empty slots.
        continue;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first round, skip empty slots in front of the start position.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        // The last argument evaluates to true here (newly inserted key).
        ScoreFunctor::update_with_digest(
            bucket_keys_ptr, key_pos, scores, kv_idx, score, bucket_capacity,
            get_digest<K>(key), (occupy_result != OccupyResult::DUPLICATE));
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }
  // Neither the key nor an empty slot was found: pick a victim by score.
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
    // Stage the first STRIDE_S scores of the bucket in this thread's row.
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      // Prefetch the next STRIDE_S scores while the current ones are scanned.
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            // A slot whose key is LOCKED_KEY or EMPTY_KEY can never be taken
            // by the CAS below, so it must not become the victim candidate;
            // otherwise this thread keeps rescanning the bucket until that
            // slot changes.
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (score < min_score) {
      // The incoming pair loses: report it through the evicted outputs.
      occupy_result = OccupyResult::REFUSED;
      evict_idx = atomicAdd(evicted_counter, 1);
      evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,
                            score);
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        // Re-read the score under the lock; it may have been raised since the
        // scan, in which case the slot is released and the scan is repeated.
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          // occupy_result is still INITIAL, so the last argument is true.
          ScoreFunctor::update_with_digest(
              bucket_keys_ptr, key_pos, scores, kv_idx, score, bucket_capacity,
              get_digest<K>(key), (occupy_result != OccupyResult::DUPLICATE));

          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
            evict_idx = atomicAdd(evicted_counter, 1);
            evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx,
                                  expected_key, min_score);
          }
        } else {
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }
  VecV* bucket_value_ptr = bucket_values_ptr + key_pos * dim;
  const VecV* param_value_ptr = values + kv_idx * dim;
  VecV* evicted_value_ptr = evicted_values + evict_idx * dim;

  if (occupy_result != OccupyResult::REFUSED) {
    if (occupy_result == OccupyResult::EVICT) {
      // Save the old value before it is overwritten.
      CopyValue::ldg_stg(0, evicted_value_ptr, bucket_value_ptr, dim);
    }
    CopyValue::ldg_stg(0, bucket_value_ptr, param_value_ptr, dim);
    auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
    // memory_order_release:
    // Modifications to the bucket will not be reordered after this
    // instruction. Storing the key also releases the LOCKED_KEY marker.
    key_address->store(key, cuda::std::memory_order_release);
  } else {
    CopyValue::ldg_stg(0, evicted_value_ptr, param_value_ptr, dim);
  }
}

// Use 1 thread to deal with a KV-pair, but use a threads group to copy value.
//
// Thread `kv_idx = blockIdx.x * blockDim.x + threadIdx.x` locks a slot for
// `keys[kv_idx]`. Afterwards the GROUP_SIZE threads of its tile copy the
// values of the group's keys one key at a time, staging them in shared memory.
// Threads with `kv_idx >= n` or a reserved key are marked ILLEGAL instead of
// returning, so that they still take part in the group shuffles and copies.
//
// Shared memory: `sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S]`, first used as
// per-thread score staging and then reused as per-group value staging, which
// requires GROUP_BUF >= 2 * dim (see the comment at the reuse site).
//
// On EVICT the old key/score/value are written to slot `evict_idx` of the
// `evicted_*` outputs; on REFUSED the incoming key/score/value are written
// there instead. `evict_idx` comes from `atomicAdd(evicted_counter, 1)`.
//
// NOTE(review): `kv_idx`, `evict_idx` and `dim` are 32-bit, so the value
// offsets `kv_idx * dim` and `evict_idx * dim` are computed in 32 bits;
// confirm callers keep these products below 2^32.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,
          uint32_t GROUP_SIZE = 16, int Strategy = -1>
__global__ void tlp_v2_upsert_and_evict_kernel_unique(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, const VecV* __restrict__ values,
    const S* __restrict__ scores, K* __restrict__ evicted_keys,
    VecV* __restrict__ evicted_values, S* __restrict__ evicted_scores,
    uint64_t n, uint64_t* __restrict__ evicted_counter, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // bucket_capacity is a multiple of 4.
  constexpr uint32_t STRIDE_S = 4;
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  __shared__ __align__(sizeof(byte16))
      S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t evict_idx{0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      // Snapshot only; other threads may change the size concurrently.
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      // Reserved key: skip the search but stay for the group copy phase.
      occupy_result = OccupyResult::ILLEGAL;
    }
  } else {
    // Out of range: skip the search but stay for the group copy phase.
    occupy_result = OccupyResult::ILLEGAL;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep one bit per matching byte so `x & (x - 1)` drops one match.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // memory_order_acquire:
        // Modifications to the bucket will not be reordered before this
        // instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        // The key is already stored in this slot and is now locked.
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);
        break;
      } else if (bucket_size == bucket_capacity) {
        // The bucket looked full: do not search for empty slots.
        continue;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first round, skip empty slots in front of the start position.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        // An empty slot was locked for the new key.
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }
  // Neither the key nor an empty slot was found: pick a victim by score.
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
    // Stage the first STRIDE_S scores of the bucket in this thread's row.
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      // Prefetch the next STRIDE_S scores while the current ones are scanned.
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            // Only slots holding a key other than LOCKED_KEY / EMPTY_KEY may
            // become the victim candidate; the CAS below rejects the others.
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (score < min_score) {
      // The incoming pair loses: report it through the evicted outputs.
      occupy_result = OccupyResult::REFUSED;
      evict_idx = atomicAdd(evicted_counter, 1);
      evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,
                            score);
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        // Re-read the score under the lock; if it is now above the scanned
        // minimum the slot is released and the scan is repeated.
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                           kv_idx, score, bucket_capacity,
                                           get_digest<K>(key), true);
          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            // The slot held RECLAIM_KEY, so the bucket gains one entry.
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            // A live key is replaced: export its key and scanned score.
            occupy_result = OccupyResult::EVICT;
            evict_idx = atomicAdd(evicted_counter, 1);
            evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx,
                                  expected_key, min_score);
          }
        } else {
          // Put the original key back, which releases the lock.
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }
  // ILLEGAL threads never resolved a bucket, so their pointer stays null.
  VecV* bucket_value_ptr{nullptr};
  if (occupy_result != OccupyResult::ILLEGAL) {
    bucket_value_ptr = bucket_values_ptr + key_pos * dim;
  }
  uint32_t rank = g.thread_rank();
  uint32_t groupID = threadIdx.x / GROUP_SIZE;

  // Shared memory reuse:
  // __shared__ S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];
  // __shared__ VecV sm_values_buffer[GROUP_NUM][2][GROUP_BUF];
  // assert(GROUP_BUF >= 2 * dim);
  // GROUP_BUFs is the number of VecV elements that fit in the GROUP_SIZE rows
  // of sm_bucket_scores; each group gets two halves of GROUP_BUF elements.
  constexpr uint32_t GROUP_BUFs =
      GROUP_SIZE * 2 * STRIDE_S * sizeof(S) / sizeof(VecV);
  constexpr uint32_t GROUP_BUF = GROUP_BUFs / 2;
  auto sm_values_buffer =
      reinterpret_cast<VecV*>(&(sm_bucket_scores[0][0])) + groupID * GROUP_BUFs;

  // Prologue: stage the data of lane 0's key into the first half-buffer.
  auto occupy_result_next = g.shfl(occupy_result, 0);
  if (occupy_result_next != OccupyResult::ILLEGAL) {
    auto kv_idx_next = g.shfl(kv_idx, 0);
    const VecV* src = values + kv_idx_next * dim;
    VecV* dst = sm_values_buffer;
    CopyValue::ldg_sts(rank, dst, src, dim);

    if (occupy_result_next == OccupyResult::EVICT) {
      // Also stage the value that is about to be overwritten, right behind
      // the incoming value.
      const VecV* src = g.shfl(bucket_value_ptr, 0);
      dst = dst + dim;
      CopyValue::ldg_sts(rank, dst, src, dim);
    }
  }
  __pipeline_commit();

  // Iteration i stages the data of lane i + 1 and writes out the data of
  // lane i, alternating between the two half-buffers.
  for (int i = 0; i < GROUP_SIZE; i++) {
    if (i + 1 < GROUP_SIZE) {
      auto occupy_result_next = g.shfl(occupy_result, i + 1);
      if (occupy_result_next != OccupyResult::ILLEGAL) {
        auto kv_idx_next = g.shfl(kv_idx, i + 1);
        const VecV* src = values + kv_idx_next * dim;
        VecV* dst = sm_values_buffer + diff_buf(i) * GROUP_BUF;
        CopyValue::ldg_sts(rank, dst, src, dim);

        if (occupy_result_next == OccupyResult::EVICT) {
          const VecV* src = g.shfl(bucket_value_ptr, i + 1);
          dst = dst + dim;
          CopyValue::ldg_sts(rank, dst, src, dim);
        }
      }
    }
    __pipeline_commit();
    auto occupy_result_cur = g.shfl(occupy_result, i);
    if (occupy_result_cur != OccupyResult::ILLEGAL) {
      auto evict_idx_cur = g.shfl(evict_idx, i);

      VecV* src = sm_values_buffer + same_buf(i) * GROUP_BUF;
      if (occupy_result_cur != OccupyResult::REFUSED) {
        VecV* dst = g.shfl(bucket_value_ptr, i);
        __pipeline_wait_prior(1);
        CopyValue::lds_stg(rank, dst, src, dim);
        // Only lane i holds the key and slot position of the current pair.
        if (rank == i) {
          auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
          // memory_order_release:
          // Modifications to the bucket will not be reordered after this
          // instruction. Storing the key also releases the LOCKED_KEY marker.
          key_address->store(key, cuda::std::memory_order_release);
        }
        if (occupy_result_cur == OccupyResult::EVICT) {
          // The old value was staged behind the incoming one (src + dim).
          src = src + dim;
          VecV* dst = evicted_values + evict_idx_cur * dim;
          CopyValue::lds_stg(rank, dst, src, dim);
        }
      } else {
        // REFUSED: the incoming value itself goes to the evicted output.
        VecV* dst = evicted_values + evict_idx_cur * dim;
        __pipeline_wait_prior(1);
        CopyValue::lds_stg(rank, dst, src, dim);
      }
    }
  }
}

// Compile-time layout of the dynamic shared-memory block used by the
// pipelined upsert-and-evict kernel. Every region starts at a byte offset
// that is fixed by the template parameters; only the size of the trailing
// value buffer depends on the run-time `dim`.
template <typename K, typename V, typename S, typename VecV,
          uint32_t BLOCK_SIZE, uint32_t GROUP_SIZE, uint32_t BUCKET_SIZE,
          uint32_t GROUP_NUM = BLOCK_SIZE / GROUP_SIZE,
          uint32_t offset_param_scores = 0,
          uint32_t offset_bucket_values_ptr =
              offset_param_scores + sizeof(S) * BLOCK_SIZE,
          uint32_t offset_buckets_size_ptr =
              offset_bucket_values_ptr + sizeof(VecV*) * BLOCK_SIZE,
          uint32_t offset_bucket_digests =
              offset_buckets_size_ptr + sizeof(int*) * BLOCK_SIZE,
          uint32_t offset_bucket_scores =
              offset_bucket_digests + sizeof(D) * GROUP_NUM * 2 * BUCKET_SIZE,
          uint32_t offset_values_buffer =
              offset_bucket_scores + sizeof(S) * GROUP_NUM * 2 * BUCKET_SIZE>
struct SharedMemoryManager_Pipeline_UpsertAndEvict {
  // Regions, in address order (equivalent static declarations):
  //   S     sm_param_scores[BLOCK_SIZE];
  //   VecV* sm_bucket_values_ptr[BLOCK_SIZE];
  //   int*  sm_buckets_size_ptr[BLOCK_SIZE];
  //   D     sm_bucket_digests[GROUP_NUM][2][BUCKET_SIZE];
  //   S     sm_bucket_scores[GROUP_NUM][2][BUCKET_SIZE];
  //   VecV  sm_values_buffer[GROUP_NUM][2][dim * 2];

  // Number of bytes covered by all regions for a value length of `dim`
  // (counted in VecV elements).
  static inline uint32_t total_size(uint32_t dim) {
    // One score, one value pointer and one size pointer per thread.
    const auto bytes_per_thread = sizeof(S) + sizeof(VecV*) + sizeof(int*);
    // Each group owns two buffers; a buffer holds the digests and scores of
    // one bucket plus two value rows.
    const auto bytes_per_buffer =
        BUCKET_SIZE * (sizeof(D) + sizeof(S)) + 2 * dim * sizeof(VecV);
    return BLOCK_SIZE * bytes_per_thread + GROUP_NUM * 2 * bytes_per_buffer;
  }

  // Per-thread score slots; index with the thread's position in the block.
  static __forceinline__ __device__ S* param_scores(byte* smem) {
    byte* region = smem + offset_param_scores;
    return reinterpret_cast<S*>(region);
  }

  // Per-thread slots holding the value-array pointer of the target bucket.
  static __forceinline__ __device__ VecV** bucket_values_ptr(byte* smem) {
    byte* region = smem + offset_bucket_values_ptr;
    return reinterpret_cast<VecV**>(region);
  }

  // Per-thread slots holding the pointer to the target bucket's size counter.
  static __forceinline__ __device__ int** buckets_size_ptr(byte* smem) {
    byte* region = smem + offset_buckets_size_ptr;
    return reinterpret_cast<int**>(region);
  }

  // Digest buffer `buf` (0 or 1) of group `groupID`; BUCKET_SIZE entries.
  static __forceinline__ __device__ D* bucket_digests(byte* smem,
                                                      uint32_t groupID,
                                                      uint32_t buf) {
    D* region = reinterpret_cast<D*>(smem + offset_bucket_digests);
    const uint32_t slot = groupID * 2 + buf;
    return region + BUCKET_SIZE * slot;
  }

  // Score buffer `buf` (0 or 1) of group `groupID`; BUCKET_SIZE entries.
  static __forceinline__ __device__ S* bucket_scores(byte* smem,
                                                     uint32_t groupID,
                                                     uint32_t buf) {
    S* region = reinterpret_cast<S*>(smem + offset_bucket_scores);
    const uint32_t slot = groupID * 2 + buf;
    return region + BUCKET_SIZE * slot;
  }

  // Value buffer `buf` (0 or 1) of group `groupID`; room for two rows of
  // `dim` VecV elements each.
  static __forceinline__ __device__ VecV* values_buffer(byte* smem,
                                                        uint32_t groupID,
                                                        uint32_t buf,
                                                        uint32_t dim) {
    VecV* region = reinterpret_cast<VecV*>(smem + offset_values_buffer);
    const uint32_t slot = groupID * 2 + buf;
    return region + 2 * dim * slot;
  }
};

// Pipelined upsert-and-evict kernel.
//
// Thread mapping: thread `blockIdx.x * blockDim.x + threadIdx.x` owns one
// key/value pair; threads past `n` and threads holding a reserved key are
// marked ILLEGAL and only assist their group. The block is split into tiles
// of GROUP_SIZE (32) threads. A tile processes the keys of its lanes one
// after another, with the stages of consecutive lanes overlapped:
//   Step 1 (lane i + 1): async copy of the bucket digests to shared memory.
//   Step 2 (lane i)    : probe the digests, lock a matching or an empty slot,
//                        otherwise start the async copy of the bucket scores.
//   Step 3 (lane i - 1): find the minimum score, then refuse the new key or
//                        lock the victim slot; prefetch the values.
//   Step 4 (lane i - 2): store the values and publish the key.
// The code after the main loop drains Step 3 for the last lane and Step 4 for
// the last two lanes.
//
// Preconditions visible in the code:
//   * The bucket capacity is fixed at BUCKET_SIZE = 128.
//   * Dynamic shared memory is addressed through
//     SharedMemoryManager_Pipeline_UpsertAndEvict; the launch presumably has
//     to provide SMM::total_size(dim) bytes - confirm at the launch site.
//   * `dim` is counted in VecV elements (it scales VecV pointers).
//   * NOTE(review): the `_unique` suffix suggests the keys of a batch must be
//     distinct - confirm against the caller.
//
// Parameters:
//   buckets, buckets_size, buckets_num  table storage and per-bucket sizes.
//   keys, values, scores                input batch of `n` entries; `scores`
//                                       may be nullptr.
//   evicted_keys/values/scores          output slots for evicted or refused
//                                       entries, indexed by the value fetched
//                                       from `evicted_counter`.
//   global_epoch                        forwarded to ScoreFunctor.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128, int Strategy = -1>
__global__ void pipeline_upsert_and_evict_kernel_unique(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, const uint32_t dim, const K* __restrict__ keys,
    const VecV* __restrict__ values, const S* __restrict__ scores,
    K* __restrict__ evicted_keys, VecV* __restrict__ evicted_values,
    S* __restrict__ evicted_scores, uint64_t n,
    uint64_t* __restrict__ evicted_counter, const S global_epoch) {
  // Here, GROUP_SIZE * Comp_LEN = BUCKET_SIZE.
  constexpr uint32_t BUCKET_SIZE = 128;
  constexpr uint32_t GROUP_SIZE = 32;
  constexpr uint32_t Comp_LEN = sizeof(VecD_Comp) / sizeof(D);
  constexpr uint32_t Load_LEN = sizeof(VecD_Load) / sizeof(D);
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);

  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using SMM =
      SharedMemoryManager_Pipeline_UpsertAndEvict<K, V, S, VecV, BLOCK_SIZE,
                                                  GROUP_SIZE, BUCKET_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  extern __shared__ __align__(sizeof(byte16)) byte smem[];

  // Initialization.
  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());
  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  // Zero-initialized because every lane, including ILLEGAL ones, passes this
  // register to `g.shfl` in Step 2.
  VecD_Comp target_digests{0};
  K* bucket_keys_ptr{nullptr};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  uint32_t key_pos = 0;
  uint32_t evict_idx = 0;
  if (kv_idx < n) {
    key = keys[kv_idx];
    if (scores != nullptr) {
      S* sm_param_scores = SMM::param_scores(smem);
      __pipeline_memcpy_async(sm_param_scores + tx, scores + kv_idx, sizeof(S));
    }
    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * BUCKET_SIZE));
      uint64_t bkt_idx = global_idx / BUCKET_SIZE;
      key_pos = get_start_position(global_idx, BUCKET_SIZE);
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      sm_buckets_size_ptr[tx] = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
      __pipeline_memcpy_async(sm_bucket_values_ptr + tx, &(bucket->vectors),
                              sizeof(VecV*));
    } else {
      occupy_result = OccupyResult::ILLEGAL;
    }
  } else {
    occupy_result = OccupyResult::ILLEGAL;
  }

  uint32_t rank = g.thread_rank();
  uint32_t groupID = threadIdx.x / GROUP_SIZE;

  // Pipeline loading.
  // Prime the pipeline with the digests of lane 0's bucket.
  auto occupy_result_next = g.shfl(occupy_result, 0);
  auto keys_ptr_next = g.shfl(bucket_keys_ptr, 0);
  if (occupy_result_next == OccupyResult::INITIAL) {
    D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, 0);
    D* dst = sm_bucket_digests + rank * Load_LEN;
    D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);
    if (rank * Load_LEN < BUCKET_SIZE) {
      __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));
    }
  }
  __pipeline_commit();
  // Padding, meet the param of the first `__pipeline_wait_prior`
  // in the first loop.
  __pipeline_commit();
  __pipeline_commit();
  for (int32_t i = 0; i < GROUP_SIZE; i++) {
    // Step1: load digests from global memory to shared memory.
    if (i + 1 < GROUP_SIZE) {
      auto occupy_result_next = g.shfl(occupy_result, i + 1);
      auto keys_ptr_next = g.shfl(bucket_keys_ptr, i + 1);
      if (occupy_result_next == OccupyResult::INITIAL) {
        D* sm_bucket_digests = SMM::bucket_digests(smem, groupID, diff_buf(i));
        D* dst = sm_bucket_digests + rank * Load_LEN;
        D* src = BUCKET::digests(keys_ptr_next, BUCKET_SIZE, rank * Load_LEN);
        if (rank * Load_LEN < BUCKET_SIZE) {
          __pipeline_memcpy_async(dst, src, sizeof(VecD_Load));
        }
      }
    }
    __pipeline_commit();
    // Step2: to lock the target_key or empty_key by querying digests.
    auto occupy_result_cur = g.shfl(occupy_result, i);
    if (occupy_result_cur == OccupyResult::INITIAL) {
      uint32_t tx_cur = groupID * GROUP_SIZE + i;
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
      K key_cur = g.shfl(key, i);
      auto target_digests_cur = g.shfl(target_digests, i);
      auto start_pos_cur = g.shfl(key_pos, i);
      auto keys_ptr_cur = g.shfl(bucket_keys_ptr, i);
      auto bucket_size_cur = bucket_size_ptr[0];
      __pipeline_wait_prior(3);
      D* src = SMM::bucket_digests(smem, groupID, same_buf(i));
      // Each lane inspects Comp_LEN digests; lane 0 starts at the word that
      // contains the key's start position and the lanes wrap around the
      // bucket.
      uint32_t start_offset = start_pos_cur / Comp_LEN;
      uint32_t probe_offset =
          Comp_LEN * ((start_offset + rank) & (GROUP_SIZE - 1));
      VecD_Comp probe_digests =
          *reinterpret_cast<VecD_Comp*>(src + probe_offset);
      uint32_t cmp_result = __vcmpeq4(probe_digests, target_digests_cur);
      cmp_result &= 0x01010101;
      uint32_t possible_pos = 0;
      bool result = false;
      // Try to lock every slot whose digest matches until the key itself is
      // found or the candidates are exhausted.
      do {
        if (cmp_result == 0) break;
        int32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = probe_offset + index;
        auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);
        K expected_key = key_cur;
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      uint32_t found_vote = g.ballot(result);
      if (found_vote) {
        int32_t src_lane = __ffs(found_vote) - 1;
        possible_pos = g.shfl(possible_pos, src_lane);
        if (rank == i) {
          occupy_result = OccupyResult::DUPLICATE;
          S* sm_param_scores = SMM::param_scores(smem);
          key_pos = possible_pos;
          S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,
                                                      global_epoch);
          ScoreFunctor::update_with_digest(
              bucket_keys_ptr, key_pos, sm_param_scores, tx, score, BUCKET_SIZE,
              get_digest<K>(key), false);
        }
      } else if (bucket_size_cur < BUCKET_SIZE) {
        VecD_Comp empty_digests_ = empty_digests<K>();
        cmp_result = __vcmpeq4(probe_digests, empty_digests_);
        cmp_result &= 0x01010101;
        // Lanes take turns in probing order so that the empty slot closest to
        // the start position is claimed first.
        for (int32_t offset = 0; offset < GROUP_SIZE; offset += 1) {
          if (rank == offset) {
            do {
              if (cmp_result == 0) break;
              int32_t index = (__ffs(cmp_result) - 1) >> 3;
              cmp_result &= (cmp_result - 1);
              possible_pos = probe_offset + index;
              if (offset == 0 && possible_pos < start_pos_cur) continue;
              auto current_key = BUCKET::keys(keys_ptr_cur, possible_pos);
              K expected_key = static_cast<K>(EMPTY_KEY);
              result = current_key->compare_exchange_strong(
                  expected_key, static_cast<K>(LOCKED_KEY),
                  cuda::std::memory_order_acquire,
                  cuda::std::memory_order_relaxed);
            } while (!result);
          }
          uint32_t found_vote = g.ballot(result);
          if (found_vote) {
            int32_t src_lane = __ffs(found_vote) - 1;
            possible_pos = g.shfl(possible_pos, src_lane);
            if (rank == i) {
              occupy_result = OccupyResult::OCCUPIED_EMPTY;
              S* sm_param_scores = SMM::param_scores(smem);
              S score = ScoreFunctor::desired_when_missed(sm_param_scores, tx,
                                                          global_epoch);
              int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
              int* bucket_size_ptr = sm_buckets_size_ptr[tx];
              key_pos = possible_pos;
              ScoreFunctor::update_with_digest(
                  bucket_keys_ptr, key_pos, sm_param_scores, tx, score,
                  BUCKET_SIZE, get_digest<K>(key), true);
              atomicAdd(bucket_size_ptr, 1);
            }
            break;
          }
        }
      }
      // Still unresolved: fetch the bucket scores for the reduction in Step 3
      // of the next iteration.
      occupy_result_cur = g.shfl(occupy_result, i);
      if (occupy_result_cur == OccupyResult::INITIAL) {
        S* sm_bucket_scores = SMM::bucket_scores(smem, groupID, same_buf(i));
        S* dst = sm_bucket_scores + rank * Load_LEN_S;
        S* src = BUCKET::scores(keys_ptr_cur, BUCKET_SIZE, rank * Load_LEN_S);
#pragma unroll
        for (int32_t k = 0; k < BUCKET_SIZE; k += GROUP_SIZE * Load_LEN_S) {
          __pipeline_memcpy_async(dst + k, src + k, sizeof(S) * Load_LEN_S);
        }
      }
    }
    __pipeline_commit();
    // Step 3: reduce to get the key with the minimum score.
    if (i > 0) {
      occupy_result_cur = g.shfl(occupy_result, i - 1);
      uint32_t tx_cur = groupID * GROUP_SIZE + i - 1;
      S* sm_param_scores = SMM::param_scores(smem);
      S score_cur = ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur,
                                                      global_epoch);
      int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
      auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
      __pipeline_wait_prior(3);
      S* src = SMM::bucket_scores(smem, groupID, diff_buf(i));
      while (occupy_result_cur == OccupyResult::INITIAL) {
        int min_pos_local = -1;
        S min_score_local = MAX_SCORE;
#pragma unroll
        for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {
          S temp_scores[Load_LEN_S];
          *reinterpret_cast<byte16*>(temp_scores) =
              *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);
#pragma unroll
          for (int k = 0; k < Load_LEN_S; k++) {
            S temp_score = temp_scores[k];
            if (temp_score < min_score_local) {
              min_score_local = temp_score;
              min_pos_local = rank * Load_LEN_S + j + k;
            }
          }
        }
        const S min_score_global =
            cg::reduce(g, min_score_local, cg::less<S>());
        if (score_cur < min_score_global) {
          // The new key scores lower than every resident key: refuse it and
          // report it through the evicted buffers.
          if (rank == i - 1) {
            occupy_result = OccupyResult::REFUSED;
            evict_idx = atomicAdd(evicted_counter, 1);
            evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,
                                  score_cur);
          }
          occupy_result_cur = g.shfl(occupy_result, i - 1);
          break;
        }
        uint32_t vote = g.ballot(min_score_local <= min_score_global);
        if (vote) {
          int src_lane = __ffs(vote) - 1;
          int min_pos_global = g.shfl(min_pos_local, src_lane);
          if (rank == i - 1) {
            src[min_pos_global] = static_cast<S>(MAX_SCORE);  // Mark visited.
            auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);
            auto expected_key =
                min_score_key->load(cuda::std::memory_order_relaxed);
            if (expected_key != static_cast<K>(LOCKED_KEY) &&
                expected_key != static_cast<K>(EMPTY_KEY)) {
              bool result = min_score_key->compare_exchange_strong(
                  expected_key, static_cast<K>(LOCKED_KEY),
                  cuda::std::memory_order_acquire,
                  cuda::std::memory_order_relaxed);
              if (result) {
                // Re-read the score from the bucket: the shared-memory copy
                // may be stale by the time the slot is locked.
                S* score_ptr = BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE,
                                              min_pos_global);
                auto verify_score_ptr =
                    reinterpret_cast<AtomicScore<S>*>(score_ptr);
                auto verify_score =
                    verify_score_ptr->load(cuda::std::memory_order_relaxed);
                if (verify_score <= min_score_global) {
                  if (expected_key == static_cast<K>(RECLAIM_KEY)) {
                    occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
                    atomicAdd(bucket_size_ptr, 1);
                  } else {
                    occupy_result = OccupyResult::EVICT;
                    evict_idx = atomicAdd(evicted_counter, 1);
                    evict_key_score<K, S>(evicted_keys, evicted_scores,
                                          evict_idx, expected_key,
                                          min_score_global);
                  }
                  key_pos = min_pos_global;
                  ScoreFunctor::update_with_digest(
                      bucket_keys_ptr, key_pos, sm_param_scores, tx_cur,
                      score_cur, BUCKET_SIZE, get_digest<K>(key), true);

                } else {
                  // The score changed in the meantime: unlock and retry.
                  min_score_key->store(expected_key,
                                       cuda::std::memory_order_release);
                }
              }
            }
          }
          occupy_result_cur = g.shfl(occupy_result, i - 1);
        }
      }
      // Prefetch values to shared memory.
      if (occupy_result_cur != OccupyResult::ILLEGAL) {
        auto kv_idx_cur = g.shfl(kv_idx, i - 1);
        // Widen before multiplying: `kv_idx_cur * dim` would wrap in 32 bits.
        const VecV* src = values + static_cast<uint64_t>(kv_idx_cur) * dim;
        VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(i), dim);
        CopyValue::ldg_sts(rank, dst, src, dim);

        if (occupy_result_cur == OccupyResult::EVICT) {
          VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
          auto bucket_values_ptr =
              sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 1];
          auto key_pos_cur = g.shfl(key_pos, i - 1);
          const VecV* src = bucket_values_ptr + key_pos_cur * dim;
          dst = dst + dim;
          CopyValue::ldg_sts(rank, dst, src, dim);
        }
      }
    }
    __pipeline_commit();

    // Step 4: write values to bucket and evicted buffer.
    if (i > 1) {
      occupy_result_cur = g.shfl(occupy_result, i - 2);
      if (occupy_result_cur != OccupyResult::ILLEGAL) {
        VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
        auto bucket_values_ptr =
            sm_bucket_values_ptr[groupID * GROUP_SIZE + i - 2];
        auto key_pos_cur = g.shfl(key_pos, i - 2);
        auto evict_idx_cur = g.shfl(evict_idx, i - 2);

        VecV* src = SMM::values_buffer(smem, groupID, same_buf(i), dim);
        if (occupy_result_cur == OccupyResult::REFUSED) {
          // Widen before multiplying to avoid 32-bit wrap-around.
          VecV* dst =
              evicted_values + static_cast<uint64_t>(evict_idx_cur) * dim;
          __pipeline_wait_prior(3);
          CopyValue::lds_stg(rank, dst, src, dim);
        } else {
          VecV* dst = bucket_values_ptr + key_pos_cur * dim;
          __pipeline_wait_prior(3);
          CopyValue::lds_stg(rank, dst, src, dim);
          if (rank == i - 2) {
            // Publish the key, which also releases the slot lock.
            auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
            key_address->store(key, cuda::std::memory_order_release);
          }
          if (occupy_result_cur == OccupyResult::EVICT) {
            src = src + dim;
            VecV* dst =
                evicted_values + static_cast<uint64_t>(evict_idx_cur) * dim;
            __pipeline_wait_prior(3);
            CopyValue::lds_stg(rank, dst, src, dim);
          }
        }
      }
    }
  }
  // Drain: Step 3 for the last lane of the group.
  auto occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
  uint32_t tx_cur = groupID * GROUP_SIZE + GROUP_SIZE - 1;
  S* sm_param_scores = SMM::param_scores(smem);
  S score_cur =
      ScoreFunctor::desired_when_missed(sm_param_scores, tx_cur, global_epoch);
  int** sm_buckets_size_ptr = SMM::buckets_size_ptr(smem);
  auto bucket_size_ptr = sm_buckets_size_ptr[tx_cur];
  __pipeline_wait_prior(1);
  S* src = SMM::bucket_scores(smem, groupID, diff_buf(GROUP_SIZE));
  while (occupy_result_cur == OccupyResult::INITIAL) {
    int min_pos_local = -1;
    S min_score_local = MAX_SCORE;
#pragma unroll
    for (int j = 0; j < BUCKET_SIZE; j += GROUP_SIZE * Load_LEN_S) {
      S temp_scores[Load_LEN_S];
      *reinterpret_cast<byte16*>(temp_scores) =
          *reinterpret_cast<byte16*>(src + rank * Load_LEN_S + j);
#pragma unroll
      for (int k = 0; k < Load_LEN_S; k++) {
        S temp_score = temp_scores[k];
        if (temp_score < min_score_local) {
          min_score_local = temp_score;
          min_pos_local = rank * Load_LEN_S + j + k;
        }
      }
    }
    const S min_score_global = cg::reduce(g, min_score_local, cg::less<S>());
    if (score_cur < min_score_global) {
      if (rank == GROUP_SIZE - 1) {
        occupy_result = OccupyResult::REFUSED;
        evict_idx = atomicAdd(evicted_counter, 1);
        evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,
                              score_cur);
      }
      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
      break;
    }
    uint32_t vote = g.ballot(min_score_local <= min_score_global);
    if (vote) {
      int src_lane = __ffs(vote) - 1;
      int min_pos_global = g.shfl(min_pos_local, src_lane);
      if (rank == GROUP_SIZE - 1) {
        src[min_pos_global] = MAX_SCORE;  // Mark visited.
        auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos_global);
        auto expected_key =
            min_score_key->load(cuda::std::memory_order_acquire);
        if (expected_key != static_cast<K>(LOCKED_KEY) &&
            expected_key != static_cast<K>(EMPTY_KEY)) {
          bool result = min_score_key->compare_exchange_strong(
              expected_key, static_cast<K>(LOCKED_KEY),
              cuda::std::memory_order_acquire, cuda::std::memory_order_acquire);
          if (result) {
            S* score_ptr =
                BUCKET::scores(bucket_keys_ptr, BUCKET_SIZE, min_pos_global);
            auto verify_score_ptr =
                reinterpret_cast<AtomicScore<S>*>(score_ptr);
            auto verify_score =
                verify_score_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_score <= min_score_global) {
              if (expected_key == static_cast<K>(RECLAIM_KEY)) {
                atomicAdd(bucket_size_ptr, 1);
                occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
              } else {
                occupy_result = OccupyResult::EVICT;
                evict_idx = atomicAdd(evicted_counter, 1);
                evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx,
                                      expected_key, min_score_global);
              }
              key_pos = min_pos_global;
              ScoreFunctor::update_with_digest(
                  bucket_keys_ptr, key_pos, sm_param_scores, tx_cur, score_cur,
                  BUCKET_SIZE, get_digest<K>(key), true);
            } else {
              min_score_key->store(expected_key,
                                   cuda::std::memory_order_release);
            }
          }
        }
      }
      occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
    }
  }
  // Drain: value prefetch for the last lane.
  if (occupy_result_cur != OccupyResult::ILLEGAL) {
    auto kv_idx_cur = g.shfl(kv_idx, GROUP_SIZE - 1);
    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
    auto bucket_values_ptr =
        sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1];
    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);

    // Widen before multiplying to avoid 32-bit wrap-around.
    const VecV* src = values + static_cast<uint64_t>(kv_idx_cur) * dim;
    VecV* dst = SMM::values_buffer(smem, groupID, diff_buf(GROUP_SIZE), dim);
    CopyValue::ldg_sts(rank, dst, src, dim);

    if (occupy_result_cur == OccupyResult::EVICT) {
      const VecV* src = bucket_values_ptr + key_pos_cur * dim;
      dst = dst + dim;
      CopyValue::ldg_sts(rank, dst, src, dim);
    }
  }
  __pipeline_commit();

  // Drain: Step 4 for the second-to-last lane.
  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 2);
  if (occupy_result_cur != OccupyResult::ILLEGAL) {
    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
    auto bucket_values_ptr =
        sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 2];
    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 2);
    auto evict_idx_cur = g.shfl(evict_idx, GROUP_SIZE - 2);

    VecV* src = SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE), dim);
    if (occupy_result_cur == OccupyResult::REFUSED) {
      VecV* dst = evicted_values + static_cast<uint64_t>(evict_idx_cur) * dim;
      __pipeline_wait_prior(1);
      CopyValue::lds_stg(rank, dst, src, dim);
    } else {
      VecV* dst = bucket_values_ptr + key_pos_cur * dim;
      __pipeline_wait_prior(1);
      CopyValue::lds_stg(rank, dst, src, dim);
      if (rank == GROUP_SIZE - 2) {
        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
        key_address->store(key, cuda::std::memory_order_release);
      }
      if (occupy_result_cur == OccupyResult::EVICT) {
        src = src + dim;
        VecV* dst =
            evicted_values + static_cast<uint64_t>(evict_idx_cur) * dim;
        __pipeline_wait_prior(1);
        CopyValue::lds_stg(rank, dst, src, dim);
      }
    }
  }

  // Drain: Step 4 for the last lane.
  occupy_result_cur = g.shfl(occupy_result, GROUP_SIZE - 1);
  if (occupy_result_cur != OccupyResult::ILLEGAL) {
    VecV** sm_bucket_values_ptr = SMM::bucket_values_ptr(smem);
    auto bucket_values_ptr =
        sm_bucket_values_ptr[groupID * GROUP_SIZE + GROUP_SIZE - 1];
    auto evict_idx_cur = g.shfl(evict_idx, GROUP_SIZE - 1);
    auto key_pos_cur = g.shfl(key_pos, GROUP_SIZE - 1);

    VecV* src =
        SMM::values_buffer(smem, groupID, same_buf(GROUP_SIZE + 1), dim);
    if (occupy_result_cur == OccupyResult::REFUSED) {
      VecV* dst = evicted_values + static_cast<uint64_t>(evict_idx_cur) * dim;
      __pipeline_wait_prior(0);
      CopyValue::lds_stg(rank, dst, src, dim);
    } else {
      VecV* dst = bucket_values_ptr + key_pos_cur * dim;
      __pipeline_wait_prior(0);
      CopyValue::lds_stg(rank, dst, src, dim);
      if (rank == GROUP_SIZE - 1) {
        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
        key_address->store(key, cuda::std::memory_order_release);
      }
      if (occupy_result_cur == OccupyResult::EVICT) {
        src = src + dim;
        VecV* dst =
            evicted_values + static_cast<uint64_t>(evict_idx_cur) * dim;
        __pipeline_wait_prior(0);
        CopyValue::lds_stg(rank, dst, src, dim);
      }
    }
  }
}

// Bundle of the arguments taken by the upsert-and-evict kernels in this file.
// The constructor copies each argument into the field of the same name;
// pointers are stored as given and nothing is validated.
template <typename K = uint64_t, typename V = float, typename S = uint64_t>
struct Params_UpsertAndEvict {
  float load_factor;                      // Stored as given by the caller.
  Bucket<K, V, S>* __restrict__ buckets;  // Bucket array of the table.
  int* buckets_size;                      // Per-bucket size counters.
  size_t buckets_num;                     // Number of buckets.
  uint32_t bucket_capacity;               // Slots per bucket.
  uint32_t dim;                           // Length of one value.
  const K* __restrict__ keys;             // Input keys.
  const V* __restrict__ values;           // Input values.
  const S* __restrict__ scores;           // Input scores.
  K* __restrict__ evicted_keys;           // Output: evicted keys.
  V* __restrict__ evicted_values;         // Output: evicted values.
  S* __restrict__ evicted_scores;         // Output: evicted scores.
  uint64_t n;                             // Number of input entries.
  uint64_t* evicted_counter;              // Counter of evicted entries.
  const S global_epoch;                   // Epoch passed on to the kernels.

  Params_UpsertAndEvict(
      float load_factor_, Bucket<K, V, S>* __restrict__ buckets_,
      int* buckets_size_, size_t buckets_num_, uint32_t bucket_capacity_,
      uint32_t dim_, const K* __restrict__ keys_, const V* __restrict__ values_,
      const S* __restrict__ scores_, K* __restrict__ evicted_keys_,
      V* __restrict__ evicted_values_, S* __restrict__ evicted_scores_,
      size_t n_, size_t* evicted_counter_, const S global_epoch_)
      : load_factor(load_factor_),
        buckets(buckets_),
        buckets_size(buckets_size_),
        buckets_num(buckets_num_),
        bucket_capacity(bucket_capacity_),
        dim(dim_),
        keys(keys_),
        values(values_),
        scores(scores_),
        evicted_keys(evicted_keys_),
        evicted_values(evicted_values_),
        evicted_scores(evicted_scores_),
        n(n_),
        evicted_counter(evicted_counter_),
        global_epoch(global_epoch_) {}
};

// Use one thread to handle each KV pair, but use a thread group to copy its
// value.
template <typename K = uint64_t, typename V = byte4, typename S = uint64_t,
          typename VecV = byte16, uint32_t BLOCK_SIZE = 128,
          uint32_t GROUP_SIZE = 32, int Strategy = -1>
__global__ void insert_and_evict_kernel_with_filter(
    Bucket<K, V, S>* __restrict__ buckets, int32_t* __restrict__ buckets_size,
    const uint64_t buckets_num, uint32_t bucket_capacity, const uint32_t dim,
    const K* __restrict__ keys, const VecV* __restrict__ values,
    const S* __restrict__ scores, K* __restrict__ evicted_keys,
    VecV* __restrict__ evicted_values, S* __restrict__ evicted_scores,
    uint64_t n, uint64_t* __restrict__ evicted_counter, const S global_epoch) {
  using BUCKET = Bucket<K, V, S>;
  using CopyValue = CopyValueMultipleGroup<VecV, GROUP_SIZE>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  // bucket_capacity is a multiple of 4.
  constexpr uint32_t STRIDE_S = 4;
  constexpr uint32_t Load_LEN_S = sizeof(byte16) / sizeof(S);
  __shared__ __align__(sizeof(byte16))
      S sm_bucket_scores[BLOCK_SIZE][2 * STRIDE_S];

  auto g = cg::tiled_partition<GROUP_SIZE>(cg::this_thread_block());

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  VecV* bucket_values_ptr{nullptr};
  K* bucket_keys_ptr{nullptr};
  int32_t* bucket_size_ptr{nullptr};
  uint32_t key_pos = {0};
  uint32_t evict_idx{0};
  uint32_t bucket_size{0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (!IS_RESERVED_KEY<K>(key)) {
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      bucket_size_ptr = buckets_size + bkt_idx;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_size = *bucket_size_ptr;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
      bucket_values_ptr = reinterpret_cast<VecV*>(bucket->vectors);
    } else {
      occupy_result = OccupyResult::ILLEGAL;
    }
  } else {
    occupy_result = OccupyResult::ILLEGAL;
  }

  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);
  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    if (occupy_result != OccupyResult::INITIAL) break;

    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      bool result = false;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = key;
        // Modifications to the bucket will not be reordered before this
        // instruction.
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::DUPLICATE;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), false);
        break;
      } else if (bucket_size == bucket_capacity) {
        continue;
      }
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = BUCKET::keys(bucket_keys_ptr, possible_pos);
        K expected_key = static_cast<K>(EMPTY_KEY);
        result = current_key->compare_exchange_strong(
            expected_key, static_cast<K>(LOCKED_KEY),
            cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      } while (!result);
      if (result) {
        occupy_result = OccupyResult::OCCUPIED_EMPTY;
        key_pos = possible_pos;
        ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                         kv_idx, score, bucket_capacity,
                                         get_digest<K>(key), true);
        atomicAdd(bucket_size_ptr, 1);
        break;
      }
    }
  }
  while (occupy_result == OccupyResult::INITIAL) {
    S* bucket_scores_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, 0);
    S min_score = static_cast<S>(MAX_SCORE);
    int min_pos = -1;
#pragma unroll
    for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
      __pipeline_memcpy_async(sm_bucket_scores[tx] + j, bucket_scores_ptr + j,
                              sizeof(S) * Load_LEN_S);
    }
    __pipeline_commit();
    for (int i = 0; i < bucket_capacity; i += STRIDE_S) {
      if (i < bucket_capacity - STRIDE_S) {
#pragma unroll
        for (int j = 0; j < STRIDE_S; j += Load_LEN_S) {
          __pipeline_memcpy_async(
              sm_bucket_scores[tx] + diff_buf(i / STRIDE_S) * STRIDE_S + j,
              bucket_scores_ptr + i + STRIDE_S + j, sizeof(S) * Load_LEN_S);
        }
      }
      __pipeline_commit();
      __pipeline_wait_prior(1);
      S temp_scores[Load_LEN_S];
      S* src = sm_bucket_scores[tx] + same_buf(i / STRIDE_S) * STRIDE_S;
#pragma unroll
      for (int k = 0; k < STRIDE_S; k += Load_LEN_S) {
        *reinterpret_cast<byte16*>(temp_scores) =
            *reinterpret_cast<byte16*>(src + k);
#pragma unroll
        for (int j = 0; j < Load_LEN_S; j += 1) {
          S temp_score = temp_scores[j];
          if (temp_score < min_score) {
            auto verify_key_ptr = BUCKET::keys(bucket_keys_ptr, i + k + j);
            auto verify_key =
                verify_key_ptr->load(cuda::std::memory_order_relaxed);
            if (verify_key != static_cast<K>(LOCKED_KEY) &&
                verify_key != static_cast<K>(EMPTY_KEY)) {
              min_score = temp_score;
              min_pos = i + k + j;
            }
          }
        }
      }
    }
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (score < min_score) {
      occupy_result = OccupyResult::REFUSED;
      evict_idx = atomicAdd(evicted_counter, 1);
      evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx, key,
                            score);
      break;
    }
    auto min_score_key = BUCKET::keys(bucket_keys_ptr, min_pos);
    auto expected_key = min_score_key->load(cuda::std::memory_order_relaxed);
    if (expected_key != static_cast<K>(LOCKED_KEY) &&
        expected_key != static_cast<K>(EMPTY_KEY)) {
      bool result = min_score_key->compare_exchange_strong(
          expected_key, static_cast<K>(LOCKED_KEY),
          cuda::std::memory_order_acquire, cuda::std::memory_order_relaxed);
      if (result) {
        S* min_score_ptr =
            BUCKET::scores(bucket_keys_ptr, bucket_capacity, min_pos);
        auto verify_score_ptr =
            reinterpret_cast<AtomicScore<S>*>(min_score_ptr);
        auto verify_score =
            verify_score_ptr->load(cuda::std::memory_order_relaxed);
        if (verify_score <= min_score) {
          key_pos = min_pos;
          ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores,
                                           kv_idx, score, bucket_capacity,
                                           get_digest<K>(key), true);
          if (expected_key == static_cast<K>(RECLAIM_KEY)) {
            occupy_result = OccupyResult::OCCUPIED_RECLAIMED;
            atomicAdd(bucket_size_ptr, 1);
          } else {
            occupy_result = OccupyResult::EVICT;
            evict_idx = atomicAdd(evicted_counter, 1);
            evict_key_score<K, S>(evicted_keys, evicted_scores, evict_idx,
                                  expected_key, min_score);
          }
        } else {
          min_score_key->store(expected_key, cuda::std::memory_order_release);
        }
      }
    }
  }
  VecV* bucket_value_ptr{nullptr};
  if (occupy_result != OccupyResult::ILLEGAL) {
    bucket_value_ptr = bucket_values_ptr + key_pos * dim;
  }
  uint32_t rank = g.thread_rank();

  for (int i = 0; i < GROUP_SIZE; i++) {
    auto occupy_result_cur = g.shfl(occupy_result, i);
    if (occupy_result_cur == OccupyResult::ILLEGAL) {
      continue;
    }
    auto kv_idx_cur = kv_idx / GROUP_SIZE * GROUP_SIZE + i;
    VecV const* input_buffer = values + kv_idx_cur * dim;
    auto evict_idx_cur = g.shfl(evict_idx, i);
    VecV* evict_buffer = evicted_values + evict_idx_cur * dim;
    VecV* table_buffer = g.shfl(bucket_value_ptr, i);
    if (occupy_result_cur == OccupyResult::EVICT) {
      for (int j = rank; j < dim; j += GROUP_SIZE) {
        evict_buffer[j] = table_buffer[j];
      }
    }
    if (occupy_result_cur == OccupyResult::REFUSED) {
      for (int j = rank; j < dim; j += GROUP_SIZE) {
        evict_buffer[j] = input_buffer[j];
      }
    } else {
      for (int j = rank; j < dim; j += GROUP_SIZE) {
        table_buffer[j] = input_buffer[j];
      }
      if (rank == i) {
        auto key_address = BUCKET::keys(bucket_keys_ptr, key_pos);
        // memory_order_release:
        // Modifications to the bucket will not after this instruction.
        key_address->store(key, cuda::std::memory_order_release);
      }
    }
  }
}

/// Host-side launcher for `insert_and_evict_kernel_with_filter`.
///
/// `launch` rewrites `params.dim` in place so that it counts `VecV` elements
/// instead of `V` elements, then starts ceil(n / 128) blocks of 128 threads on
/// `stream`. The launch is asynchronous; no error check is performed here.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct LaunchInsertAndEvictKernel {
  using Params = Params_UpsertAndEvict<K, V, S>;
  inline static void launch(Params& params, cudaStream_t& stream) {
    constexpr int THREADS_PER_BLOCK = 128;
    constexpr int THREADS_PER_GROUP = 32;

    // From here on the value length is expressed in VecV units.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    const auto num_blocks =
        (params.n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    const VecV* values_in = reinterpret_cast<const VecV*>(params.values);
    VecV* evicted_values_out = reinterpret_cast<VecV*>(params.evicted_values);

    insert_and_evict_kernel_with_filter<K, V, S, VecV, THREADS_PER_BLOCK,
                                        THREADS_PER_GROUP, Strategy>
        <<<num_blocks, THREADS_PER_BLOCK, 0, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.bucket_capacity, params.dim, params.keys, values_in,
            params.scores, params.evicted_keys, evicted_values_out,
            params.evicted_scores, params.n, params.evicted_counter,
            params.global_epoch);
  }
};

/// Chooses the widest vector type that evenly divides one value
/// (`params.dim * sizeof(V)` bytes) and forwards to
/// `LaunchInsertAndEvictKernel` instantiated with it. Falls back to `V` itself
/// when the value size is a multiple of neither 16 nor 8 bytes.
template <typename K, typename V, typename S, int Strategy>
struct InsertAndEvictKernelLauncher {
  using Params = Params_UpsertAndEvict<K, V, S>;
  static void launch_kernel(Params& params, cudaStream_t& stream) {
    const uint32_t value_bytes = static_cast<uint32_t>(params.dim * sizeof(V));
    const bool divisible_by_16 = (value_bytes % sizeof(byte16)) == 0;
    const bool divisible_by_8 = (value_bytes % sizeof(byte8)) == 0;

    if (divisible_by_16) {
      LaunchInsertAndEvictKernel<K, V, S, byte16, Strategy>::launch(params,
                                                                    stream);
      return;
    }
    if (divisible_by_8) {
      LaunchInsertAndEvictKernel<K, V, S, byte8, Strategy>::launch(params,
                                                                   stream);
      return;
    }
    // No vectorized access possible: move values one V at a time.
    LaunchInsertAndEvictKernel<K, V, S, V, Strategy>::launch(params, stream);
  }  // End function
};

/// Host-side launcher for `tlp_v1_upsert_and_evict_kernel_unique`.
///
/// Converts `params.dim` in place from a count of `V` to a count of `VecV`
/// and launches ceil(n / 128) blocks of 128 threads on `stream`.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_TLPv1_UpsertAndEvict {
  using Params = Params_UpsertAndEvict<K, V, S>;
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int THREADS_PER_BLOCK = 128;

    // From here on the value length is expressed in VecV units.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    const auto num_blocks =
        (params.n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    const VecV* values_in = reinterpret_cast<const VecV*>(params.values);
    VecV* evicted_values_out = reinterpret_cast<VecV*>(params.evicted_values);

    tlp_v1_upsert_and_evict_kernel_unique<K, V, S, VecV, THREADS_PER_BLOCK,
                                          Strategy>
        <<<num_blocks, THREADS_PER_BLOCK, 0, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.bucket_capacity, params.dim, params.keys, values_in,
            params.scores, params.evicted_keys, evicted_values_out,
            params.evicted_scores, params.n, params.evicted_counter,
            params.global_epoch);
  }
};

/// Host-side launcher for `tlp_v2_upsert_and_evict_kernel_unique`.
///
/// Converts `params.dim` in place from a count of `V` to a count of `VecV`,
/// then instantiates the kernel with a group size of 8 when the converted
/// length is at most 8 and with a group size of 16 otherwise.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_TLPv2_UpsertAndEvict {
  using Params = Params_UpsertAndEvict<K, V, S>;
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    // The group-size decision below is made on the VecV-based length.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);
    if (params.dim <= 8) {
      launch_with_group<8>(params, stream);
    } else {
      launch_with_group<16>(params, stream);
    }
  }

 private:
  /// Launches the kernel for one compile-time group size. Expects
  /// `params.dim` to be already expressed in VecV units.
  template <int GROUP_SIZE>
  inline static void launch_with_group(Params& params, cudaStream_t& stream) {
    constexpr int THREADS_PER_BLOCK = 128;
    const auto num_blocks =
        (params.n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    const VecV* values_in = reinterpret_cast<const VecV*>(params.values);
    VecV* evicted_values_out = reinterpret_cast<VecV*>(params.evicted_values);

    tlp_v2_upsert_and_evict_kernel_unique<K, V, S, VecV, THREADS_PER_BLOCK,
                                          GROUP_SIZE, Strategy>
        <<<num_blocks, THREADS_PER_BLOCK, 0, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.bucket_capacity, params.dim, params.keys, values_in,
            params.scores, params.evicted_keys, evicted_values_out,
            params.evicted_scores, params.n, params.evicted_counter,
            params.global_epoch);
  }
};

/// Host-side launcher for `pipeline_upsert_and_evict_kernel_unique`.
///
/// Converts `params.dim` in place from a count of `V` to a count of `VecV`,
/// asks `SharedMemoryManager_Pipeline_UpsertAndEvict` for the dynamic shared
/// memory size, rounds that size up to a multiple of `sizeof(byte16)` and
/// launches ceil(n / 128) blocks of 128 threads on `stream`.
template <typename K, typename V, typename S, typename VecV, int Strategy>
struct Launch_Pipeline_UpsertAndEvict {
  using Params = Params_UpsertAndEvict<K, V, S>;
  inline static void launch_kernel(Params& params, cudaStream_t& stream) {
    constexpr int THREADS_PER_BLOCK = 128;
    constexpr uint32_t THREADS_PER_GROUP = 32;
    constexpr uint32_t SLOTS_PER_BUCKET = 128;
    using SharedMem = SharedMemoryManager_Pipeline_UpsertAndEvict<
        K, V, S, VecV, THREADS_PER_BLOCK, THREADS_PER_GROUP, SLOTS_PER_BUCKET>;

    // From here on the value length is expressed in VecV units.
    params.dim = params.dim * sizeof(V) / sizeof(VecV);

    // Round the requested shared memory up to the next byte16 boundary.
    const uint32_t requested_bytes = SharedMem::total_size(params.dim);
    const uint32_t shared_bytes = static_cast<uint32_t>(
        (requested_bytes + sizeof(byte16) - 1) / sizeof(byte16) *
        sizeof(byte16));

    const auto num_blocks =
        (params.n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    const VecV* values_in = reinterpret_cast<const VecV*>(params.values);
    VecV* evicted_values_out = reinterpret_cast<VecV*>(params.evicted_values);

    pipeline_upsert_and_evict_kernel_unique<K, V, S, VecV, THREADS_PER_BLOCK,
                                            Strategy>
        <<<num_blocks, THREADS_PER_BLOCK, shared_bytes, stream>>>(
            params.buckets, params.buckets_size, params.buckets_num,
            params.dim, params.keys, values_in, params.scores,
            params.evicted_keys, evicted_values_out, params.evicted_scores,
            params.n, params.evicted_counter, params.global_epoch);
  }
};

/// Per-architecture value-size thresholds consulted by
/// `KernelSelector_UpsertAndEvict`. Only the explicit specializations are
/// defined; the primary template is intentionally left incomplete.
template <typename ArchTag>
struct ValueConfig_UpsertAndEvict;

/// TODO: support more architectures.
/// Thresholds for the `Sm80` tag. Each limit is a value size in bytes,
/// written as a number of `byte4` elements times `sizeof(byte4)`.
template <>
struct ValueConfig_UpsertAndEvict<Sm80> {
  // Values larger than this perform poorly with the TLPv1 kernel.
  static constexpr uint32_t size_tlp_v1 = 16 * sizeof(byte4);
  // Values larger than this produce wrong results with the TLPv2 kernel.
  static constexpr uint32_t size_tlp_v2 = 64 * sizeof(byte4);
  // Values larger than this reduce the occupancy of the Pipeline kernel.
  // If the value is very large, the kernel fails to launch.
  static constexpr uint32_t size_pipeline = 128 * sizeof(byte4);
};

/// Thresholds for the `Sm70` tag. Each limit is a value size in bytes,
/// written as a number of `byte4` elements times `sizeof(byte4)`.
template <>
struct ValueConfig_UpsertAndEvict<Sm70> {
  // Values larger than this perform poorly with the TLPv1 kernel.
  static constexpr uint32_t size_tlp_v1 = 16 * sizeof(byte4);
  // Values larger than this produce wrong results with the TLPv2 kernel.
  static constexpr uint32_t size_tlp_v2 = 32 * sizeof(byte4);
  // Values larger than this reduce the occupancy of the Pipeline kernel.
  // If the value is very large, the kernel fails to launch.
  static constexpr uint32_t size_pipeline = 64 * sizeof(byte4);
};

/// Decides whether the specialized upsert-and-evict kernels can be used and,
/// if so, which family (TLPv1, TLPv2 or Pipeline) to launch.
///
/// The size thresholds come from `ValueConfig_UpsertAndEvict<ArchTag>`.
/// TLPv2 is only compiled in when `CUDART_VERSION >= 11030`.
template <typename K, typename V, typename S, int Strategy, typename ArchTag>
struct KernelSelector_UpsertAndEvict {
  using ValueConfig = ValueConfig_UpsertAndEvict<ArchTag>;
  using Params = Params_UpsertAndEvict<K, V, S>;

  /// Returns true when `select_kernel` has a suitable kernel for the given
  /// configuration.
  ///
  /// @param unique_key   Must be true; otherwise the answer is always false.
  /// @param bucket_size  Bucket capacity; must be at least
  ///                     `sizeof(VecD_Load) / sizeof(D)`.
  /// @param dim          Value length in elements of `V`.
  static bool callable(bool unique_key, uint32_t bucket_size, uint32_t dim) {
    constexpr uint32_t MinBucketCap = sizeof(VecD_Load) / sizeof(D);
    if (!unique_key) return false;
    if (bucket_size < MinBucketCap) return false;

    const uint32_t value_size = dim * sizeof(V);
    // Largest value size that one of the TLP kernels can take in this build.
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
    constexpr uint32_t tlp_limit = ValueConfig::size_tlp_v2;
#else
    constexpr uint32_t tlp_limit = ValueConfig::size_tlp_v1;
#endif
    if (value_size <= tlp_limit) return true;

    // Beyond the TLP limit only the Pipeline kernel remains, and it is only
    // selected for buckets of exactly 128 slots.
    return bucket_size == 128 && value_size <= ValueConfig::size_pipeline;
  }

  /// Launches one kernel on `stream` for `params`.
  ///
  /// Note that the launcher that ends up being called rewrites `params.dim`
  /// in place (from a count of `V` to a count of the chosen vector type).
  static void select_kernel(Params& params, cudaStream_t& stream) {
    const uint32_t total_value_size =
        static_cast<uint32_t>(params.dim * sizeof(V));
    const bool fits_tlp_v1 = total_value_size <= ValueConfig::size_tlp_v1;

    // This part is according to the test on A100.
    if (params.bucket_capacity != 128) {
      if (fits_tlp_v1) {
        dispatch_by_vector_width<Launch_TLPv1_UpsertAndEvict>(
            params, stream, total_value_size);
        return;
      }
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
      dispatch_by_vector_width<Launch_TLPv2_UpsertAndEvict>(params, stream,
                                                            total_value_size);
#else
      dispatch_by_vector_width<Launch_TLPv1_UpsertAndEvict>(params, stream,
                                                            total_value_size);
#endif
      return;
    }

    // Buckets of 128 slots: the Pipeline kernel is also available.
    if (fits_tlp_v1) {
      if (params.load_factor <= 0.90f) {
        dispatch_by_vector_width<Launch_TLPv1_UpsertAndEvict>(
            params, stream, total_value_size);
      } else {
        dispatch_by_vector_width<Launch_Pipeline_UpsertAndEvict>(
            params, stream, total_value_size);
      }
      return;
    }
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 11030)
    if (total_value_size <= ValueConfig::size_tlp_v2 &&
        params.load_factor <= 0.85f) {
      dispatch_by_vector_width<Launch_TLPv2_UpsertAndEvict>(params, stream,
                                                            total_value_size);
      return;
    }
#endif
    dispatch_by_vector_width<Launch_Pipeline_UpsertAndEvict>(params, stream,
                                                             total_value_size);
  }  // End function

 private:
  /// Instantiates `Launcher` with the widest of byte16 / byte8 / byte4 /
  /// byte2 / byte whose size evenly divides `total_value_size`, and calls its
  /// `launch_kernel`.
  template <template <typename, typename, typename, typename, int>
            class Launcher>
  static void dispatch_by_vector_width(Params& params, cudaStream_t& stream,
                                       const uint32_t total_value_size) {
    if (total_value_size % sizeof(byte16) == 0) {
      Launcher<K, V, S, byte16, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte8) == 0) {
      Launcher<K, V, S, byte8, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte4) == 0) {
      Launcher<K, V, S, byte4, Strategy>::launch_kernel(params, stream);
    } else if (total_value_size % sizeof(byte2) == 0) {
      Launcher<K, V, S, byte2, Strategy>::launch_kernel(params, stream);
    } else {
      Launcher<K, V, S, byte, Strategy>::launch_kernel(params, stream);
    }
  }
};

/**
 * Upsert-and-evict kernel in which a cooperative tile of TILE_SIZE threads
 * works on one key and also performs the value copies.
 *
 * Work distribution: the flat index `t` strides over [0, N) and each key is
 * addressed as `t / TILE_SIZE`, so N is expected to be the number of keys
 * times TILE_SIZE (this is what SelectUpsertAndEvictKernelWithIO passes).
 *
 * Outputs: for key index i, `evicted_keys[i]`, `evicted_scores[i]` and
 * `evicted_values[i * dim ...]` are written only when the insertion is
 * refused (the input key/score/value are echoed back) or when an existing
 * entry is evicted. They are left untouched otherwise.
 *
 * @param table           Table descriptor; only `buckets_size` is read here.
 * @param buckets         Bucket array handed to `get_key_position`.
 * @param bucket_max_size Capacity of a bucket, in slots.
 * @param buckets_num     Number of buckets.
 * @param dim             Value length in elements of V.
 * @param keys            Input keys. Reserved keys are skipped.
 * @param values          Input values, `dim` elements per key.
 * @param scores          Input scores. May be nullptr as far as this kernel's
 *                        own accesses are concerned; otherwise it is only
 *                        forwarded to ScoreFunctor.
 * @param evicted_keys    Output keys, indexed by key index.
 * @param evicted_values  Output values, `dim` elements per key index.
 * @param evicted_scores  Output scores, indexed by key index.
 * @param global_epoch    Forwarded to `ScoreFunctor::desired_when_missed`.
 * @param N               Upper bound of the flat thread index (see above).
 */
template <class K, class V, class S, int Strategy, uint32_t TILE_SIZE = 4>
__global__ void upsert_and_evict_kernel_with_io_core(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
    const K* __restrict keys, const V* __restrict values,
    const S* __restrict scores, K* __restrict evicted_keys,
    V* __restrict evicted_values, S* __restrict evicted_scores,
    const S global_epoch, size_t N) {
  // Tile of TILE_SIZE threads that cooperates on one key.
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int* buckets_size = table->buckets_size;

  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_pos = -1;
    // TILE_SIZE consecutive values of t map to the same key.
    // NOTE(review): this assumes all threads of a tile share one key_idx,
    // i.e. blockDim.x is a multiple of TILE_SIZE -- confirm at the call site.
    const size_t key_idx = t / TILE_SIZE;

    const K insert_key = keys[key_idx];

    if (IS_RESERVED_KEY<K>(insert_key)) continue;

    const S insert_score =
        ScoreFunctor::desired_when_missed(scores, key_idx, global_epoch);
    const V* insert_value = values + key_idx * dim;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    int src_lane = -1;
    K evicted_key;

    // bkt_idx and start_idx are passed in zero-initialized and read
    // afterwards; presumably get_key_position fills them in.
    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, insert_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    OccupyResult occupy_result{OccupyResult::INITIAL};
    // The bucket size is sampled once; retries below reuse this snapshot.
    const int bucket_size = buckets_size[bkt_idx];
    do {
      if (bucket_size < bucket_max_size) {
        occupy_result = find_and_lock_when_vacant<K, V, S, TILE_SIZE>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      } else {
        // Round the probe start down to a multiple of TILE_SIZE.
        start_idx = (start_idx / TILE_SIZE) * TILE_SIZE;
        occupy_result = find_and_lock_when_full<K, V, S, TILE_SIZE,
                                                ScoreFunctor::LOCK_MEM_ORDER,
                                                ScoreFunctor::UNLOCK_MEM_ORDER>(
            g, bucket, insert_key, insert_score, evicted_key, start_idx,
            key_pos, src_lane, bucket_max_size);
      }
      // Every thread of the tile adopts the result held by lane src_lane.
      occupy_result = g.shfl(occupy_result, src_lane);
    } while (occupy_result == OccupyResult::CONTINUE);

    // Insertion refused: echo the input key, score and value back through
    // the evicted_* outputs and leave the bucket untouched.
    if (occupy_result == OccupyResult::REFUSED) {
      if (g.thread_rank() == 0) {
        evicted_keys[key_idx] = insert_key;
        evicted_scores[key_idx] = insert_score;
      }
      copy_vector<V, TILE_SIZE>(g, insert_value, evicted_values + key_idx * dim,
                                dim);
      continue;
    }

    // A previously empty or reclaimed slot was taken: the bucket grows by
    // one. Only one lane performs the increment.
    if ((occupy_result == OccupyResult::OCCUPIED_EMPTY ||
         occupy_result == OccupyResult::OCCUPIED_RECLAIMED) &&
        g.thread_rank() == src_lane) {
      atomicAdd(&(buckets_size[bkt_idx]), 1);
    }

    // An existing entry is displaced: save its key and value before the
    // slot is overwritten below.
    if (occupy_result == OccupyResult::EVICT) {
      if (g.thread_rank() == src_lane) {
        evicted_keys[key_idx] = evicted_key;
        // NOTE(review): this stores the *input* score of the inserted key,
        // not the score of the evicted entry -- confirm this is intended.
        if (scores != nullptr) {
          evicted_scores[key_idx] = scores[key_idx];
        }
      }
      copy_vector<V, TILE_SIZE>(g, bucket->vectors + key_pos * dim,
                                evicted_values + key_idx * dim, dim);
    }

    // Write the new value into the slot, then let one lane update the score
    // and digest and, last of all, store the key with UNLOCK_MEM_ORDER.
    copy_vector<V, TILE_SIZE>(g, insert_value, bucket->vectors + key_pos * dim,
                              dim);
    if (g.thread_rank() == src_lane) {
      ScoreFunctor::update(bucket, key_pos, scores, key_idx, insert_score,
                           (occupy_result != OccupyResult::DUPLICATE));
      bucket->digests(key_pos)[0] = get_digest<K>(insert_key);
      (bucket->keys(key_pos))
          ->store(insert_key, ScoreFunctor::UNLOCK_MEM_ORDER);
    }
  }
}

/// Launches `upsert_and_evict_kernel_with_io_core` with a tile size picked
/// from the load factor: 4 threads per key up to 0.5, 8 up to 0.875 and 32
/// above that.
template <typename K, typename V, typename S, int Strategy>
struct SelectUpsertAndEvictKernelWithIO {
  static void execute_kernel(
      const float& load_factor, const int& block_size,
      const size_t bucket_max_size, const size_t buckets_num, const size_t dim,
      cudaStream_t& stream, const size_t& n,
      const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
      const K* __restrict keys, const V* __restrict values,
      const S* __restrict scores, K* __restrict evicted_keys,
      V* __restrict evicted_values, S* __restrict evicted_scores,
      const S global_epoch) {
    if (load_factor <= 0.5) {
      launch_with_tile<4>(block_size, bucket_max_size, buckets_num, dim,
                          stream, n, table, buckets, keys, values, scores,
                          evicted_keys, evicted_values, evicted_scores,
                          global_epoch);
    } else if (load_factor <= 0.875) {
      launch_with_tile<8>(block_size, bucket_max_size, buckets_num, dim,
                          stream, n, table, buckets, keys, values, scores,
                          evicted_keys, evicted_values, evicted_scores,
                          global_epoch);
    } else {
      launch_with_tile<32>(block_size, bucket_max_size, buckets_num, dim,
                           stream, n, table, buckets, keys, values, scores,
                           evicted_keys, evicted_values, evicted_scores,
                           global_epoch);
    }
    return;
  }

 private:
  /// Launches the kernel for one compile-time tile size. The kernel receives
  /// `n * tile_size` as its flat index bound, and the grid size is derived
  /// from that bound.
  template <unsigned int tile_size>
  static void launch_with_tile(
      const int& block_size, const size_t bucket_max_size,
      const size_t buckets_num, const size_t dim, cudaStream_t& stream,
      const size_t& n, const Table<K, V, S>* table, Bucket<K, V, S>* buckets,
      const K* keys, const V* values, const S* scores, K* evicted_keys,
      V* evicted_values, S* evicted_scores, const S global_epoch) {
    const size_t N = n * tile_size;
    const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    upsert_and_evict_kernel_with_io_core<K, V, S, Strategy, tile_size>
        <<<grid_size, block_size, 0, stream>>>(
            table, buckets, bucket_max_size, buckets_num, dim, keys, values,
            scores, evicted_keys, evicted_values, evicted_scores, global_epoch,
            N);
  }
};

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/core_kernels.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cstdint>
#include <cub/cub.cuh>
#include <cuda/std/functional>
#include "allocator.cuh"
#include "core_kernels/kernel_utils.cuh"
#include "core_kernels/accum_or_assign.cuh"
#include "core_kernels/contains.cuh"
#include "core_kernels/find_or_insert.cuh"
#include "core_kernels/find_ptr_or_insert.cuh"
#include "core_kernels/lookup.cuh"
#include "core_kernels/lookup_ptr.cuh"
#include "core_kernels/update.cuh"
#include "core_kernels/update_score.cuh"
#include "core_kernels/update_values.cuh"
#include "core_kernels/upsert.cuh"
#include "core_kernels/upsert_and_evict.cuh"
// Dual-bucket headers depend on types from lookup.cuh and upsert.cuh
// (FoundFunctorV1, LookupValueBufConfig, Params_Upsert,
// SharedMemoryManager_Pipeline_Upsert), so they must come after.
#include "core_kernels/dual_bucket_utils.cuh"
#include "core_kernels/dual_bucket_upsert.cuh"
#include "core_kernels/dual_bucket_lookup.cuh"

namespace nv {
namespace merlin {

/// Default-constructs one `S` in place at `mutex[start + tid]` for every
/// thread whose slot lies below `end`; `tid` is the flat 1-D thread index.
/// Threads past the end do nothing.
template <class S>
__global__ void create_locks(S* __restrict mutex, const size_t start,
                             const size_t end) {
  const size_t thread_offset = (blockIdx.x * blockDim.x) + threadIdx.x;
  const size_t slot = start + thread_offset;
  if (slot >= end) {
    return;
  }
  new (mutex + slot) S();
}

/// Runs the destructor of the `S` stored at `mutex[start + tid]` for every
/// thread whose slot lies below `end`; `tid` is the flat 1-D thread index.
/// The memory itself is not freed.
template <class S>
__global__ void release_locks(S* __restrict mutex, const size_t start,
                              const size_t end) {
  const size_t thread_offset = (blockIdx.x * blockDim.x) + threadIdx.x;
  const size_t slot = start + thread_offset;
  if (slot >= end) {
    return;
  }
  S* lock = mutex + slot;
  lock->~S();
}

/// Resets the key side of the buckets in [start, end), one bucket per thread.
///
/// For its bucket a thread first writes the empty digest into all
/// `bucket_max_size` digest slots, then constructs an `AtomicKey<K>` holding
/// EMPTY_KEY in every key slot. The empty digest is taken from
/// `dual_bucket_empty_digest<K>()` when `dual_bucket_mode` is set and from
/// `empty_digest<K>()` otherwise.
template <class K, class V, class S>
__global__ void create_atomic_keys(Bucket<K, V, S>* __restrict buckets,
                                   const size_t start, const size_t end,
                                   const size_t bucket_max_size,
                                   const bool dual_bucket_mode = false) {
  const size_t thread_offset = (blockIdx.x * blockDim.x) + threadIdx.x;
  const size_t bucket_idx = start + thread_offset;
  if (bucket_idx >= end) {
    return;
  }

  const D fill_digest =
      dual_bucket_mode ? dual_bucket_empty_digest<K>() : empty_digest<K>();

  // Pass 1: digests.
  for (size_t slot = 0; slot < bucket_max_size; ++slot) {
    buckets[bucket_idx].digests(slot)[0] = fill_digest;
  }
  // Pass 2: keys.
  for (size_t slot = 0; slot < bucket_max_size; ++slot) {
    new (buckets[bucket_idx].keys(slot))
        AtomicKey<K>{static_cast<K>(EMPTY_KEY)};
  }
}

/// Constructs an `AtomicScore<S>` holding EMPTY_SCORE in every score slot of
/// the buckets in [start, end), one bucket per thread.
template <class K, class V, class S>
__global__ void create_atomic_scores(Bucket<K, V, S>* __restrict buckets,
                                     const size_t start, const size_t end,
                                     const size_t bucket_max_size) {
  const size_t thread_offset = (blockIdx.x * blockDim.x) + threadIdx.x;
  const size_t bucket_idx = start + thread_offset;
  if (bucket_idx >= end) {
    return;
  }

  size_t slot = 0;
  while (slot < bucket_max_size) {
    new (buckets[bucket_idx].scores(slot))
        AtomicScore<S>{static_cast<S>(EMPTY_SCORE)};
    ++slot;
  }
}

/// Points `buckets[index].vectors` at `address`.
///
/// Every launched thread performs the same store, so this is meant to be
/// launched with a single thread (it is launched as <<<1, 1>>> in this file).
template <class K, class V, class S>
__global__ void allocate_bucket_vectors(Bucket<K, V, S>* __restrict buckets,
                                        const size_t index, V* address) {
  Bucket<K, V, S>* target = buckets + index;
  target->vectors = address;
}

/// Carves one contiguous allocation into the digest / key / score areas of
/// `num_of_buckets` consecutive buckets, starting at bucket `start_index`.
///
/// Each bucket receives `total_size_per_bucket` bytes laid out as:
///   [digests: reserve_size bytes][keys: bucket_max_size AtomicKey<K>]
///   [scores: starts right after the keys]
///
/// The loop is sequential and every launched thread would repeat it, so this
/// is meant to be launched with a single thread (it is launched as <<<1, 1>>>
/// in this file).
template <class K, class V, class S>
__global__ void allocate_bucket_others(Bucket<K, V, S>* __restrict buckets,
                                       size_t total_size_per_bucket,
                                       size_t num_of_buckets,
                                       const int start_index, uint8_t* address,
                                       const uint32_t reserve_size,
                                       const size_t bucket_max_size) {
  uint8_t* cursor = address;
  for (size_t step = 0; step < num_of_buckets; ++step) {
    Bucket<K, V, S>& bucket = buckets[start_index + step];

    bucket.digests_ = cursor;
    bucket.keys_ =
        reinterpret_cast<AtomicKey<K>*>(bucket.digests_ + reserve_size);
    bucket.scores_ =
        reinterpret_cast<AtomicScore<S>*>(bucket.keys_ + bucket_max_size);

    // Advance to the region reserved for the next bucket.
    cursor += total_size_per_bucket;
  }
}

/// Copies the `digests_` pointer of `buckets[index]` into `*address`.
///
/// NOTE(review): `address` is dereferenced in device code, so it presumably
/// has to be device-accessible memory -- confirm at the call site.
template <class K, class V, class S>
__global__ void get_bucket_others_address(Bucket<K, V, S>* __restrict buckets,
                                          const int index, uint8_t** address) {
  const Bucket<K, V, S>* source = buckets + index;
  *address = source->digests_;
}

/**
 * Replaces the device buffer `*ptr` with a new one of `new_size` bytes.
 *
 * The first min(old_size, new_size) bytes of the old buffer are copied over
 * and the rest of the new buffer is zero-filled. When `*ptr` is nullptr
 * there is nothing to copy and the whole new buffer is zero-filled.
 * The old buffer, if any, is returned to `allocator`.
 *
 * @param ptr       In/out: address of the pointer to reallocate.
 * @param old_size  Size in bytes of the current buffer.
 * @param new_size  Size in bytes of the buffer to allocate.
 * @param allocator Allocator used for both the allocation and the free.
 */
template <class P>
void realloc(P* ptr, size_t old_size, size_t new_size,
             BaseAllocator* allocator) {
  // Without a previous buffer no bytes are carried over. Treating old_size
  // as 0 makes the zero-fill below cover the entire new buffer instead of
  // leaving its first `old_size` bytes uninitialized.
  if (*ptr == nullptr) {
    old_size = 0;
  }

  // Truncate old_size to limit downstream copy ops.
  old_size = std::min(old_size, new_size);

  // Alloc new buffer and copy the old data. The pointer starts out as
  // nullptr (as in realloc_host) so it never holds an indeterminate value.
  char* new_ptr = nullptr;
  allocator->alloc(MemoryType::Device, (void**)&new_ptr, new_size);
  if (*ptr != nullptr) {
    CUDA_CHECK(cudaMemcpy(new_ptr, *ptr, old_size, cudaMemcpyDefault));
    allocator->free(MemoryType::Device, *ptr);
  }

  // Zero-fill remainder.
  CUDA_CHECK(cudaMemset(new_ptr + old_size, 0, new_size - old_size));

  // Switch to new pointer.
  *ptr = reinterpret_cast<P>(new_ptr);
  return;
}

/**
 * Replaces the host buffer `*ptr` with a new one of `new_size` bytes.
 *
 * The first min(old_size, new_size) bytes of the old buffer are copied over
 * and the rest of the new buffer is zero-filled. When `*ptr` is nullptr
 * there is nothing to copy and the whole new buffer is zero-filled.
 * The old buffer, if any, is returned to `allocator`.
 *
 * @param ptr       In/out: address of the pointer to reallocate.
 * @param old_size  Size in bytes of the current buffer.
 * @param new_size  Size in bytes of the buffer to allocate.
 * @param allocator Allocator used for both the allocation and the free.
 */
template <class P>
void realloc_host(P* ptr, size_t old_size, size_t new_size,
                  BaseAllocator* allocator) {
  // Without a previous buffer no bytes are carried over. Treating old_size
  // as 0 makes the zero-fill below cover the entire new buffer instead of
  // leaving its first `old_size` bytes uninitialized.
  if (*ptr == nullptr) {
    old_size = 0;
  }

  // Truncate old_size to limit downstream copy ops.
  old_size = std::min(old_size, new_size);

  // Alloc new buffer and copy the old data.
  char* new_ptr = nullptr;
  allocator->alloc(MemoryType::Host, (void**)&new_ptr, new_size);

  if (*ptr != nullptr) {
    std::memcpy(new_ptr, *ptr, old_size);
    allocator->free(MemoryType::Host, *ptr);
  }

  // Zero-fill remainder.
  std::memset(new_ptr + old_size, 0, new_size - old_size);

  // Switch to new pointer.
  *ptr = reinterpret_cast<P>(new_ptr);
  return;
}

/* Initialize the buckets with index from start to end.
 *
 * Steps:
 *   1. Allocate the value storage ("slices") for buckets [start, end) and
 *      point every bucket's `vectors` at its part of a slice. A slice goes to
 *      device memory while `remaining_hbm_for_vectors` can hold it, and to
 *      mapped pinned host memory otherwise.
 *   2. Allocate the digest/key/score storage in groups of
 *      `num_of_buckets_per_alloc` buckets and wire up the bucket pointers.
 *   3. Construct the locks, atomic keys and atomic scores of the new buckets.
 *
 * `start` must be less than `end` and a multiple of
 * `num_of_buckets_per_alloc`. The function synchronizes the device before
 * returning.
 */
template <class K, class V, class S>
void initialize_buckets(Table<K, V, S>** table, BaseAllocator* allocator,
                        const size_t start, const size_t end) {
  /* As testing results show us, when the number of buckets is greater than
   * 4 million the performance drops significantly. We believe too many
   * pinned memory allocations cause this issue, so the strategy is to
   * allocate a few memory slices whose size is not greater than 64GB and to
   * make the bucket pointers point into the slices.
   */
  MERLIN_CHECK(start < end,
               "initialize_buckets, start should be less than end!");
  size_t buckets_num = end - start;
  const size_t total_size_of_vectors =
      buckets_num * (*table)->bucket_max_size * sizeof(V) * (*table)->dim;
  const size_t num_of_memory_slices =
      1 + (total_size_of_vectors - 1) / (*table)->bytes_per_slice;
  size_t num_of_buckets_in_one_slice =
      (*table)->bytes_per_slice /
      ((*table)->bucket_max_size * sizeof(V) * (*table)->dim);
  size_t num_of_allocated_buckets = 0;

  // Grow the host-side array of slice pointers to make room for the new ones.
  realloc_host<V**>(
      &((*table)->slices), (*table)->num_of_memory_slices * sizeof(V*),
      ((*table)->num_of_memory_slices + num_of_memory_slices) * sizeof(V*),
      allocator);

  for (size_t i = (*table)->num_of_memory_slices;
       i < (*table)->num_of_memory_slices + num_of_memory_slices; i++) {
    // The last slice only holds the buckets that are still unassigned.
    if (i == (*table)->num_of_memory_slices + num_of_memory_slices - 1) {
      num_of_buckets_in_one_slice = buckets_num - num_of_allocated_buckets;
    }
    size_t slice_real_size = num_of_buckets_in_one_slice *
                             (*table)->bucket_max_size * sizeof(V) *
                             (*table)->dim;

    // Where THIS slice lives. This is tracked per slice: a flag that stays
    // set once any slice landed in device memory would make a later pinned
    // slice skip the host-to-device pointer translation below.
    bool slice_on_device = false;
    if ((*table)->remaining_hbm_for_vectors >= slice_real_size) {
      slice_on_device = true;
      allocator->alloc(MemoryType::Device, (void**)&((*table)->slices[i]),
                       slice_real_size);
      (*table)->remaining_hbm_for_vectors -= slice_real_size;
    } else {
      (*table)->is_pure_hbm = false;
      allocator->alloc(MemoryType::Pinned, (void**)&((*table)->slices[i]),
                       slice_real_size, cudaHostAllocMapped);
    }

    for (size_t j = 0; j < num_of_buckets_in_one_slice; j++) {
      const size_t index = start + num_of_allocated_buckets + j;
      V* address =
          (*table)->slices[i] + j * (*table)->bucket_max_size * (*table)->dim;
      if (!slice_on_device) {
        // Pinned host memory: translate to the address the device must use.
        V* h_ptr = address;
        address = nullptr;
        CUDA_CHECK(cudaHostGetDevicePointer(&address, h_ptr, 0));
      }
      // Launches are queued back to back; the arguments are passed by value,
      // so one synchronization after the loop is sufficient.
      allocate_bucket_vectors<K, V, S>
          <<<1, 1>>>((*table)->buckets, index, address);
    }
    CUDA_CHECK(cudaDeviceSynchronize());
    num_of_allocated_buckets += num_of_buckets_in_one_slice;
  }

  (*table)->num_of_memory_slices += num_of_memory_slices;
  uint32_t bucket_max_size = static_cast<uint32_t>((*table)->bucket_max_size);
  size_t bucket_memory_size =
      bucket_max_size * (sizeof(AtomicKey<K>) + sizeof(AtomicScore<S>));
  // Align to the cache line size: the digest area takes at least 128 bytes.
  constexpr uint32_t CACHE_LINE_SIZE = 128U / sizeof(uint8_t);
  uint32_t reserve_size =
      bucket_max_size < CACHE_LINE_SIZE ? CACHE_LINE_SIZE : bucket_max_size;
  bucket_memory_size += reserve_size * sizeof(uint8_t);

  MERLIN_CHECK(start % (*table)->num_of_buckets_per_alloc == 0,
               "initialize_buckets, start must be times of "
               "num_of_buckets_per_alloc!");
  /* NOTICE: Only the buckets whose index is a multiple of
   * `num_of_buckets_per_alloc` get a real allocation; the buckets in between
   * share it. That gives callers a way to avoid memory fragmentation.
   */
  for (size_t i = start; i < end; i += (*table)->num_of_buckets_per_alloc) {
    uint8_t* address = nullptr;
    size_t num_of_buckets =
        std::min(end - i, (*table)->num_of_buckets_per_alloc);
    allocator->alloc(MemoryType::Device, (void**)&(address),
                     bucket_memory_size * num_of_buckets);
    // The kernel takes the start index as `int`.
    allocate_bucket_others<K, V, S>
        <<<1, 1>>>((*table)->buckets, bucket_memory_size, num_of_buckets,
                   static_cast<int>(i), address, reserve_size,
                   bucket_max_size);
  }
  CUDA_CHECK(cudaDeviceSynchronize());

  // The three construction kernels below bounds-check `start + tid < end`
  // themselves, so the one spare thread requested through N is harmless.
  {
    const size_t block_size = 512;
    const size_t N = end - start + 1;
    const int grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    create_locks<Mutex><<<grid_size, block_size>>>((*table)->locks, start, end);
  }

  {
    const size_t block_size = 512;
    const size_t N = end - start + 1;
    const int grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    create_atomic_keys<K, V, S><<<grid_size, block_size>>>(
        (*table)->buckets, start, end, (*table)->bucket_max_size,
        (*table)->dual_bucket_mode);
  }

  {
    const size_t block_size = 512;
    const size_t N = end - start + 1;
    const int grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    create_atomic_scores<K, V, S><<<grid_size, block_size>>>(
        (*table)->buckets, start, end, (*table)->bucket_max_size);
  }
  CUDA_CHECK(cudaDeviceSynchronize());
  CudaCheckError();
}

/* Pick the byte size of one value-memory slice for `table`.
 *
 * The size is chosen stepwise from the number of bytes needed to store
 * `max_size` vectors of `dim` elements of V. The result is never smaller than
 * the bytes needed for the vectors of one full bucket.
 */
template <class K, class V, class S>
size_t get_slice_size(Table<K, V, S>** table) {
  const size_t bytes_per_vector = sizeof(V) * (*table)->dim;
  const size_t one_bucket_bytes = (*table)->bucket_max_size * bytes_per_vector;
  const size_t whole_table_bytes = (*table)->max_size * bytes_per_vector;

  // Small tables fall back to exactly one bucket worth of vectors.
  size_t chosen = one_bucket_bytes;
  if (whole_table_bytes >= GB(128)) {
    chosen = GB(16);
  } else if (whole_table_bytes >= GB(16)) {
    chosen = GB(2);
  } else if (whole_table_bytes >= GB(2)) {
    chosen = MB(128);
  } else if (whole_table_bytes >= MB(128)) {
    chosen = MB(16);
  } else if (whole_table_bytes >= MB(16)) {
    chosen = MB(1);
  }

  // A slice must be able to hold at least one bucket.
  return (one_bucket_bytes < chosen) ? chosen : one_bucket_bytes;
}

/* Allocate and initialize a Table struct together with its per-bucket arrays.
 *
 * K: The key type.
 * V: The value type. Should be a plain element type; C++ classes with a
 *    customized constructor are not supported.
 * S: The score type (timestamp, occurrence frequency, or anything else used
 *    for eviction).
 *
 * `table`: receives the host-allocated, zero-filled Table.
 * `allocator`: used for every host and device allocation made here.
 * `dim`: vector dimension.
 * `init_size` / `max_size`: initial and maximum number of entries; the stored
 *    `max_size` is the larger of the two.
 * The remaining parameters are copied into the Table fields of the same name.
 */
template <class K, class V, class S>
void create_table(Table<K, V, S>** table, BaseAllocator* allocator,
                  const size_t dim, const size_t init_size = 134217728,
                  const size_t max_size = std::numeric_limits<size_t>::max(),
                  const size_t max_hbm_for_vectors = 0,
                  const size_t bucket_max_size = 128,
                  const size_t num_of_buckets_per_alloc = 1,
                  const size_t tile_size = 32, const bool primary = true,
                  const bool dual_bucket_mode = false) {
  using TableT = Table<K, V, S>;
  using BucketT = Bucket<K, V, S>;

  allocator->alloc(MemoryType::Host, (void**)table, sizeof(TableT));
  std::memset(*table, 0, sizeof(TableT));
  TableT* const tbl = *table;

  tbl->dual_bucket_mode = dual_bucket_mode;
  tbl->dim = dim;
  tbl->bucket_max_size = bucket_max_size;
  tbl->max_size = std::max(init_size, max_size);
  tbl->tile_size = tile_size;
  tbl->is_pure_hbm = true;
  // Needs dim, bucket_max_size and max_size, which are all set above.
  tbl->bytes_per_slice = get_slice_size<K, V, S>(table);
  tbl->num_of_buckets_per_alloc = num_of_buckets_per_alloc;

  if ((init_size * 2) > tbl->max_size) {
    // Doubling would exceed max_size, so no rehash can happen: use the
    // smallest bucket count that covers max_size to save memory.
    tbl->buckets_num = 1 + ((tbl->max_size - 1) / tbl->bucket_max_size);
  } else {
    // Otherwise use the smallest power-of-two bucket count covering init_size.
    for (tbl->buckets_num = 1;
         tbl->buckets_num * tbl->bucket_max_size < init_size;
         tbl->buckets_num *= 2) {
    }
  }

  tbl->capacity = tbl->buckets_num * tbl->bucket_max_size;
  tbl->max_hbm_for_vectors = max_hbm_for_vectors;
  tbl->remaining_hbm_for_vectors = max_hbm_for_vectors;
  tbl->primary = primary;

  // One lock per bucket, zero-filled.
  const size_t locks_bytes = tbl->buckets_num * sizeof(Mutex);
  allocator->alloc(MemoryType::Device, (void**)&(tbl->locks), locks_bytes);
  CUDA_CHECK(cudaMemset(tbl->locks, 0, locks_bytes));

  // One occupancy counter per bucket, zero-filled.
  const size_t sizes_bytes = tbl->buckets_num * sizeof(int);
  allocator->alloc(MemoryType::Device, (void**)&(tbl->buckets_size),
                   sizes_bytes);
  CUDA_CHECK(cudaMemset(tbl->buckets_size, 0, sizes_bytes));

  // The bucket descriptors themselves, zero-filled.
  const size_t buckets_bytes = tbl->buckets_num * sizeof(BucketT);
  allocator->alloc(MemoryType::Device, (void**)&(tbl->buckets), buckets_bytes);
  CUDA_CHECK(cudaMemset(tbl->buckets, 0, buckets_bytes));

  initialize_buckets<K, V, S>(table, allocator, 0, tbl->buckets_num);
  CudaCheckError();
}

/* Grow the per-bucket arrays to twice their length and initialize the newly
 * added buckets. Existing entries are not moved here, so this must be
 * followed by a call to the rehash kernel. */
template <class K, class V, class S>
void double_capacity(Table<K, V, S>** table, BaseAllocator* allocator) {
  using BucketT = Bucket<K, V, S>;
  const auto old_buckets_num = (*table)->buckets_num;

  realloc<Mutex*>(&((*table)->locks), old_buckets_num * sizeof(Mutex),
                  old_buckets_num * sizeof(Mutex) * 2, allocator);
  realloc<int*>(&((*table)->buckets_size), old_buckets_num * sizeof(int),
                old_buckets_num * sizeof(int) * 2, allocator);
  realloc<BucketT*>(&((*table)->buckets), old_buckets_num * sizeof(BucketT),
                    old_buckets_num * sizeof(BucketT) * 2, allocator);

  // Only the upper half [old_buckets_num, 2 * old_buckets_num) is new.
  initialize_buckets<K, V, S>(table, allocator, old_buckets_num,
                              old_buckets_num * 2);

  (*table)->buckets_num *= 2;
  (*table)->capacity *= 2;
}

/* Release every resource owned by a Table, including the Table itself. */
template <class K, class V, class S>
void destroy_table(Table<K, V, S>** table, BaseAllocator* allocator) {
  Table<K, V, S>* const tbl = *table;

  // Single device-side cell used to read one bucket address back to the host.
  uint8_t** d_address = nullptr;
  CUDA_CHECK(cudaMalloc((void**)&d_address, sizeof(uint8_t*)));

  /* NOTICE: Only the buckets whose index is a multiple of
   * `num_of_buckets_per_alloc` hold a real allocation and need to be freed.
   */
  for (int bkt = 0; bkt < tbl->buckets_num;
       bkt += tbl->num_of_buckets_per_alloc) {
    uint8_t* h_address;
    get_bucket_others_address<K, V, S>
        <<<1, 1>>>(tbl->buckets, bkt, d_address);
    CUDA_CHECK(cudaMemcpy(&h_address, d_address, sizeof(uint8_t*),
                          cudaMemcpyDeviceToHost));
    allocator->free(MemoryType::Device, h_address);
  }
  CUDA_CHECK(cudaFree(d_address));

  // A slice is freed as device memory or as pinned memory, depending on
  // where its pointer lives.
  for (int slice = 0; slice < tbl->num_of_memory_slices; slice++) {
    const MemoryType where = is_on_device(tbl->slices[slice])
                                 ? MemoryType::Device
                                 : MemoryType::Pinned;
    allocator->free(where, tbl->slices[slice]);
  }

  {
    const size_t block_size = 512;
    const size_t N = tbl->buckets_num;
    const int grid_size = SAFE_GET_GRID_SIZE(N, block_size);
    release_locks<Mutex>
        <<<grid_size, block_size>>>(tbl->locks, 0, tbl->buckets_num);
  }

  allocator->free(MemoryType::Host, tbl->slices);
  allocator->free(MemoryType::Device, tbl->buckets_size);
  allocator->free(MemoryType::Device, tbl->buckets);
  allocator->free(MemoryType::Device, tbl->locks);
  allocator->free(MemoryType::Host, *table);
  CUDA_CHECK(cudaDeviceSynchronize());
  CudaCheckError();
}

/* Close the hole left at `remove_pos` in `bucket` by shifting later entries
 * of the same probe run back into it.
 *
 * Runs on a single thread; `TILE_SIZE` is not used in the body. The function
 * does not touch any bucket size counter.
 *
 * `bucket`: the bucket to compact in place.
 * `remove_pos`: slot index that has just been emptied.
 * `bucket_max_size`: number of slots per bucket. Slot indices wrap with
 *     `& (bucket_max_size - 1)` — assumes a power of two, TODO confirm.
 * `buckets_num`: bucket count used to recompute each key's start slot (the
 *     caller in rehash_kernel_for_fast_mode passes `buckets_num / 2`).
 * `dim`: number of V elements per vector.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__forceinline__ __device__ void defragmentation_for_rehash(
    Bucket<K, V, S>* __restrict bucket, uint32_t remove_pos,
    const size_t bucket_max_size, const size_t buckets_num, const size_t dim) {
  uint32_t key_idx;
  size_t global_idx = 0;
  size_t start_idx = 0;
  K find_key;
  K hashed_key;

  // Slot that is currently empty and may receive a shifted entry.
  uint32_t empty_pos = remove_pos;

  // `i` is the distance from `remove_pos` of the slot being examined.
  int i = 1;
  while (i < bucket_max_size) {
    key_idx = (remove_pos + i) & (bucket_max_size - 1);
    find_key = (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);
    // An empty slot ends the run of entries following the hole.
    if (find_key == static_cast<K>(EMPTY_KEY)) {
      break;
    }
    // Recompute the slot where this key's probing starts.
    hashed_key = Murmur3HashDevice(find_key);
    global_idx = hashed_key % (buckets_num * bucket_max_size);
    start_idx = get_start_position(global_idx, bucket_max_size);

    // The three cases cover the orderings of (start_idx, empty_pos, key_idx)
    // on the circular slot range in which the entry at key_idx is moved into
    // the hole.
    if ((start_idx <= empty_pos && empty_pos < key_idx) ||
        (key_idx < start_idx && start_idx <= empty_pos) ||
        (empty_pos <= key_idx && key_idx < start_idx)) {
      // Move digest, key, score and vector from key_idx into empty_pos.
      const K key =
          (*(bucket->keys(key_idx))).load(cuda::std::memory_order_relaxed);
      bucket->digests(empty_pos)[0] = get_digest<K>(key);
      (*(bucket->keys(empty_pos))).store(key, cuda::std::memory_order_relaxed);
      const S score =
          (*(bucket->scores(key_idx))).load(cuda::std::memory_order_relaxed);
      (*(bucket->scores(empty_pos)))
          .store(score, cuda::std::memory_order_relaxed);
      for (int j = 0; j < dim; j++) {
        bucket->vectors[empty_pos * dim + j] =
            bucket->vectors[key_idx * dim + j];
      }
      // The source slot becomes the new hole; restart the scan right after it.
      bucket->digests(key_idx)[0] = empty_digest<K>();
      (*(bucket->keys(key_idx)))
          .store(static_cast<K>(EMPTY_KEY), cuda::std::memory_order_relaxed);
      empty_pos = key_idx;
      remove_pos = key_idx;
      i = 1;
    } else {
      i++;
    }
  }
}

/* Insert (key, score, vector) into the first empty slot of `new_bucket`,
 * probing from `new_start_idx` one tile-width at a time.
 *
 * Must be called by every lane of `g`: the empty-slot search uses a tile
 * ballot and the vector copy is shared by the tile. The lane that owns the
 * chosen slot writes digest, key and score and increments
 * `buckets_size[new_bkt_idx]`. If no empty slot is found nothing is written.
 * `buckets_num` is accepted but not used in the body.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__forceinline__ __device__ void move_key_to_new_bucket(
    cg::thread_block_tile<TILE_SIZE> g, int rank, const K& key, const S& score,
    const V* __restrict vector, Bucket<K, V, S>* __restrict new_bucket,
    const size_t new_bkt_idx, const size_t new_start_idx,
    int* __restrict buckets_size, const size_t bucket_max_size,
    const size_t buckets_num, const size_t dim) {
  const size_t slot_mask = bucket_max_size - 1;

  for (uint32_t probe = 0; probe < bucket_max_size; probe += TILE_SIZE) {
    // Each lane inspects its own slot of the current probe window.
    const size_t lane_slot = (new_start_idx + probe + rank) & slot_mask;
    const K slot_key =
        (*(new_bucket->keys(lane_slot))).load(cuda::std::memory_order_relaxed);
    const unsigned vacant = g.ballot(slot_key == static_cast<K>(EMPTY_KEY));
    if (!vacant) {
      continue;
    }

    // The lowest lane that saw an empty slot owns the insertion.
    const int winner = __ffs(vacant) - 1;
    const uint32_t dst_slot = (new_start_idx + probe + winner) & slot_mask;
    if (rank == winner) {
      new_bucket->digests(dst_slot)[0] = get_digest<K>(key);
      new_bucket->keys(dst_slot)->store(key, cuda::std::memory_order_relaxed);
      new_bucket->scores(dst_slot)->store(score,
                                          cuda::std::memory_order_relaxed);
      atomicAdd(&(buckets_size[new_bkt_idx]), 1);
    }
    // The whole tile copies the vector into the chosen slot.
    copy_vector<V, TILE_SIZE>(g, vector, new_bucket->vectors + dst_slot * dim,
                              dim);
    return;
  }
}

/* Redistribute entries after the bucket array has grown.
 *
 * One tile of TILE_SIZE threads handles bucket `t / TILE_SIZE`, so `N` is
 * presumably `TILE_SIZE * <number of buckets to scan>` — verify against the
 * launcher. For every live key whose bucket index under `table->buckets_num`
 * differs from the bucket being scanned, the entry is moved to the new
 * bucket, the source slot is emptied and the source bucket is compacted.
 * The source bucket's lock is held by the tile for the whole scan.
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__global__ void rehash_kernel_for_fast_mode(
    const Table<K, V, S>* __restrict table, Bucket<K, V, S>* buckets,
    size_t N) {
  int* __restrict buckets_size = table->buckets_size;
  const size_t bucket_max_size = table->bucket_max_size;
  const size_t buckets_num = table->buckets_num;
  const size_t dim = table->dim;

  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int rank = g.thread_rank();
  size_t tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  size_t global_idx;
  uint32_t start_idx = 0;
  K target_key = 0;
  S target_score = 0;

  for (size_t t = tid; t < N; t += blockDim.x * gridDim.x) {
    uint32_t bkt_idx = t / TILE_SIZE;
    Bucket<K, V, S>* bucket = (buckets + bkt_idx);

    lock<Mutex, TILE_SIZE>(g, table->locks[bkt_idx]);
    uint32_t key_idx = 0;
    while (key_idx < bucket_max_size) {
      // Lane 0 owns the scan cursor: only it resets key_idx after a move, so
      // its value is broadcast to the tile at the top of every iteration.
      key_idx = g.shfl(key_idx, 0);
      target_key =
          (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);
      target_score =
          bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);
      if (target_key != static_cast<K>(EMPTY_KEY) &&
          target_key != static_cast<K>(RECLAIM_KEY)) {
        // Recompute the key's home bucket with the current bucket count.
        const K hashed_key = Murmur3HashDevice(target_key);
        global_idx = hashed_key % (buckets_num * bucket_max_size);
        uint32_t new_bkt_idx = global_idx / bucket_max_size;
        if (new_bkt_idx != bkt_idx) {
          start_idx = get_start_position(global_idx, bucket_max_size);
          // Tile-cooperative insert into the destination bucket.
          move_key_to_new_bucket<K, V, S, TILE_SIZE>(
              g, rank, target_key, target_score,
              (bucket->vectors + key_idx * dim), buckets + new_bkt_idx,
              new_bkt_idx, start_idx, buckets_size, bucket_max_size,
              buckets_num, table->dim);
          if (rank == 0) {
            // Empty the source slot and account for the removed entry.
            bucket->digests(key_idx)[0] = empty_digest<K>();
            (bucket->keys(key_idx))
                ->store(static_cast<K>(EMPTY_KEY),
                        cuda::std::memory_order_relaxed);
            atomicSub(&(buckets_size[bkt_idx]), 1);
            // Compact using `buckets_num / 2` — presumably the bucket count
            // before doubling, under which the remaining keys were placed;
            // TODO confirm.
            defragmentation_for_rehash<K, V, S, TILE_SIZE>(
                bucket, key_idx, bucket_max_size, buckets_num / 2, dim);
            // Entries may have shifted, so rescan the bucket from slot 0.
            key_idx = 0;
          }
        } else {
          key_idx++;
        }
      } else {
        key_idx++;
      }
    }
    unlock<Mutex, TILE_SIZE>(g, table->locks[bkt_idx]);
  }
}

/* Gather vectors from per-row source pointers into a contiguous destination,
 * honoring a per-row mask.
 *
 * `src`: array of row pointers; a null entry means the row is skipped.
 * `dst`: contiguous output holding `dim` elements per row.
 * `mask`: indexed by the destination row; a row is copied only if its mask
 *         entry is true. Rows that are not copied are left untouched.
 * `dst_offset`: optional map from source row to destination row; when null
 *               the destination row equals the source row.
 * `dim`: number of elements per row.
 * `N`: total number of elements to visit (rows * dim).
 */
template <class K, class V, class S>
__global__ void read_kernel(const V* const* __restrict src, V* __restrict dst,
                            const bool* mask, const int* __restrict dst_offset,
                            const size_t dim, size_t N) {
  const size_t first = (blockIdx.x * blockDim.x) + threadIdx.x;
  const size_t step = blockDim.x * gridDim.x;

  for (size_t elem = first; elem < N; elem += step) {
    const int row = int(elem / dim);
    const int col = elem % dim;
    const int out_row = (dst_offset == nullptr) ? row : dst_offset[row];

    // Skip rows that are masked out; the source pointer is only inspected
    // for rows that passed the mask.
    if (!mask[out_row]) {
      continue;
    }
    if (src[row] == nullptr) {
      continue;
    }
    dst[out_row * dim + col] = src[row][col];
  }
}

/* Gather vectors from per-row source pointers into a contiguous destination.
 *
 *  `src`: array of row pointers; a null entry means the row is skipped and
 *         the matching destination row is left untouched.
 *  `dst`: contiguous output holding `dim` elements per row.
 *  `dst_offset`: optional map from source row to destination row; when null
 *                the destination row equals the source row.
 *  `dim`: number of elements per row.
 *  `N`: total number of elements to visit (rows * dim).
 */
template <class K, class V, class S>
__global__ void read_kernel(const V* const* __restrict src, V* __restrict dst,
                            const int* __restrict dst_offset, const size_t dim,
                            const size_t N) {
  const size_t first = (blockIdx.x * blockDim.x) + threadIdx.x;
  const size_t step = blockDim.x * gridDim.x;

  for (size_t elem = first; elem < N; elem += step) {
    const int row = int(elem / dim);
    const int out_row = (dst_offset == nullptr) ? row : dst_offset[row];
    const int col = elem % dim;

    if (src[row] == nullptr) {
      continue;
    }
    dst[out_row * dim + col] = src[row][col];
  }
}

/* Reset every slot of the table to the empty state.
 *
 * Each visited index `t` in [0, N) maps to slot `t % bucket_max_size` of
 * bucket `t / bucket_max_size`. The slot's digest and key are reset, and the
 * thread that handles slot 0 of a bucket also zeroes that bucket's size.
 * Scores and vectors are not modified.
 */
template <class K, class V, class S>
__global__ void clear_kernel(Table<K, V, S>* __restrict table,
                             Bucket<K, V, S>* buckets, size_t N) {
  const size_t bucket_max_size = table->bucket_max_size;
  // The empty digest differs between dual-bucket and regular mode.
  const D empty_d = table->dual_bucket_mode ? dual_bucket_empty_digest<K>()
                                            : empty_digest<K>();

  const size_t first = (blockIdx.x * blockDim.x) + threadIdx.x;
  const size_t step = blockDim.x * gridDim.x;

  for (size_t t = first; t < N; t += step) {
    const int bkt_idx = t / bucket_max_size;
    const int key_idx = t % bucket_max_size;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;

    bucket->digests(key_idx)[0] = empty_d;
    (bucket->keys(key_idx))
        ->store(static_cast<K>(EMPTY_KEY), cuda::std::memory_order_relaxed);

    if (key_idx != 0) {
      continue;
    }
    table->buckets_size[bkt_idx] = 0;
  }
}

/* Remove specified keys.
 *
 * One tile of TILE_SIZE threads handles key `t / TILE_SIZE`, so the kernel
 * expects `N == TILE_SIZE * <number of keys>`. Reserved keys are skipped.
 * The tile probes the key's bucket from its start slot, one tile-width at a
 * time, and stops at the first match or at the first window that contains an
 * empty slot. A matched slot is marked as reclaimed (digest, key and score
 * are reset) and the bucket size is decremented.
 *
 * `table`: not read by this kernel.
 * `keys`: keys to remove.
 * `buckets` / `buckets_size`: bucket array and per-bucket entry counters.
 * `bucket_max_size`: slots per bucket; slot indices wrap with
 *     `& (bucket_max_size - 1)`.
 * `buckets_num`: number of buckets.
 * `N`: number of threads' worth of work (see above).
 */
template <class K, class V, class S, uint32_t TILE_SIZE = 4>
__global__ void remove_kernel(const Table<K, V, S>* __restrict table,
                              const K* __restrict keys,
                              Bucket<K, V, S>* __restrict buckets,
                              int* __restrict buckets_size,
                              const size_t bucket_max_size,
                              const size_t buckets_num, size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  int rank = g.thread_rank();

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    int key_idx = t / TILE_SIZE;
    K find_key = keys[key_idx];
    if (IS_RESERVED_KEY<K>(find_key)) continue;

    int key_pos = -1;

    size_t bkt_idx = 0;
    size_t start_idx = 0;
    uint32_t tile_offset = 0;

    Bucket<K, V, S>* bucket = get_key_position<K>(
        buckets, find_key, bkt_idx, start_idx, buckets_num, bucket_max_size);

    unsigned found_vote = 0;
#pragma unroll
    for (tile_offset = 0; tile_offset < bucket_max_size;
         tile_offset += TILE_SIZE) {
      key_pos = (start_idx + tile_offset + rank) & (bucket_max_size - 1);

      const K current_key =
          (bucket->keys(key_pos))->load(cuda::std::memory_order_relaxed);

      found_vote = g.ballot(find_key == current_key);
      if (found_vote) {
        break;
      }

      // An empty slot in the window means the key cannot be further along.
      if (g.any(current_key == static_cast<K>(EMPTY_KEY))) {
        break;
      }
    }

    if (found_vote) {
      const int src_lane = __ffs(found_vote) - 1;

      // Only the lane that saw the key performs the removal.
      if (g.thread_rank() == src_lane) {
        const int found_pos =
            (start_idx + tile_offset + src_lane) & (bucket_max_size - 1);
        bucket->digests(found_pos)[0] = reclaim_digest<K>();
        (bucket->keys(found_pos))
            ->store(static_cast<K>(RECLAIM_KEY),
                    cuda::std::memory_order_relaxed);
        (bucket->scores(found_pos))
            ->store(static_cast<S>(EMPTY_SCORE),
                    cuda::std::memory_order_relaxed);
        atomicSub(&buckets_size[bkt_idx], 1);
      }
      // Deliberately no `break` here: the enclosing loop is the grid-stride
      // loop over keys, and leaving it would skip every remaining key
      // assigned to this tile whenever the grid is smaller than N.
    }
  }
}

/* Remove every entry whose (key, score) satisfies the predicate.
 *
 * Each thread scans one whole bucket (`bkt_idx == t`), so `N` is the number
 * of buckets to visit. A matching slot is marked as reclaimed, `*count` is
 * incremented and the bucket size is decremented. After a removal the same
 * slot is read again; it then holds a reserved key and the scan advances.
 *
 * `table` is not read by this kernel.
 */
template <class K, class V, class S,
          template <typename, typename> class PredFunctor,
          uint32_t TILE_SIZE = 1>
__global__ void remove_kernel(const Table<K, V, S>* __restrict table,
                              const K pattern, const S threshold,
                              size_t* __restrict count,
                              Bucket<K, V, S>* __restrict buckets,
                              int* __restrict buckets_size,
                              const size_t bucket_max_size,
                              const size_t buckets_num, size_t N) {
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  PredFunctor<K, S> pred;

  for (size_t t = (blockIdx.x * blockDim.x) + threadIdx.x; t < N;
       t += blockDim.x * gridDim.x) {
    const uint32_t bkt_idx = t;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;

    for (uint32_t slot = 0; slot < bucket_max_size;) {
      const K slot_key =
          bucket->keys(slot)->load(cuda::std::memory_order_relaxed);
      const S slot_score =
          bucket->scores(slot)->load(cuda::std::memory_order_relaxed);

      // The predicate is only evaluated for non-reserved keys.
      const bool should_remove =
          !IS_RESERVED_KEY<K>(slot_key) &&
          pred(slot_key, slot_score, pattern, threshold);
      if (!should_remove) {
        slot++;
        continue;
      }

      atomicAdd(count, 1);
      bucket->digests(slot)[0] = reclaim_digest<K>();
      (bucket->keys(slot))
          ->store(static_cast<K>(RECLAIM_KEY), cuda::std::memory_order_relaxed);
      (bucket->scores(slot))
          ->store(static_cast<S>(EMPTY_SCORE), cuda::std::memory_order_relaxed);
      atomicSub(&buckets_size[bkt_idx], 1);
    }
  }
}

/* Remove every entry in a slot window for which `pred` returns true.
 *
 * Thread `i` inspects global slot `i + offset`. The predicate is invoked by
 * every lane with the group handle, and reserved keys are excluded
 * afterwards. Matching slots are marked as reclaimed and `*remove_counter`
 * is increased once per group by the group's match count.
 *
 * Bucket size bookkeeping depends on `bucket_capacity`:
 *  - `bucket_capacity >= GroupSize`: lane 0 subtracts the whole group count
 *    from its own bucket's size.
 *  - otherwise each matching lane subtracts 1 from its own bucket's size.
 */
template <typename K, typename V, typename S, typename PredFunctor,
          uint32_t GroupSize = 32>
__global__ void remove_kernel_v2(const uint64_t search_length,
                                 const uint64_t offset, PredFunctor pred,
                                 Bucket<K, V, S>* buckets,
                                 int* __restrict buckets_size,
                                 const uint64_t bucket_capacity,
                                 const uint64_t dim, uint64_t* remove_counter) {
  cg::thread_block_tile<GroupSize> g =
      cg::tiled_partition<GroupSize>(cg::this_thread_block());

  const uint64_t first =
      static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const bool count_per_group = bucket_capacity >= GroupSize;

  for (uint64_t i = first; i < search_length; i += gridDim.x * blockDim.x) {
    const uint64_t slot = i + offset;
    const uint64_t bkt_idx = slot / bucket_capacity;
    const uint64_t key_idx = slot % bucket_capacity;

    // Lanes of one group may land in different buckets.
    Bucket<K, V, S>* bucket = buckets + bkt_idx;

    const K key = bucket->keys(key_idx)->load(cuda::std::memory_order_relaxed);
    const S score =
        bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);
    const V* value = bucket->vectors + key_idx * dim;

    // Evaluate the predicate on every lane, then drop reserved keys.
    bool match = pred.template operator()<GroupSize>(key, value, score, g);
    match = match && !IS_RESERVED_KEY<K>(key);

    const uint32_t vote = g.ballot(match);
    const int group_cnt = __popc(vote);
    if (g.thread_rank() == 0) {
      atomicAdd(remove_counter, static_cast<uint64_t>(group_cnt));
      if (count_per_group) {
        atomicSub(&buckets_size[bkt_idx], group_cnt);
      }
    }

    if (!match) {
      continue;
    }
    // Only matching lanes erase their slot.
    bucket->digests(key_idx)[0] = reclaim_digest<K>();
    bucket->keys(key_idx)->store(static_cast<K>(RECLAIM_KEY),
                                 cuda::std::memory_order_relaxed);
    bucket->scores(key_idx)->store(static_cast<S>(EMPTY_SCORE),
                                   cuda::std::memory_order_relaxed);
    if (!count_per_group) {
      atomicSub(&buckets_size[bkt_idx], 1);
    }
  }
}

/* Compute the launch shape for the shared-memory staged dump kernel.
 *
 * Returns (shared_memory_bytes, block_size). The block size is the number of
 * KVM<K, V, S> tuples that fit in half of `available_shared_memory`, capped
 * at 1024; the byte count is the space needed for that many tuples. Fails the
 * MERLIN_CHECK when not even one tuple fits.
 */
template <class K, class V, class S>
inline std::tuple<size_t, size_t> dump_kernel_shared_memory_size(
    const size_t available_shared_memory) {
  constexpr size_t tuple_bytes = sizeof(KVM<K, V, S>);

  const size_t tuples_that_fit = available_shared_memory / 2 / tuple_bytes;
  const size_t block_size = std::min(tuples_that_fit, UINT64_C(1024));
  MERLIN_CHECK(
      block_size > 0,
      "[HierarchicalKV] block_size <= 0, the K-V-S size may be too large!");

  const size_t shared_bytes = block_size * tuple_bytes;
  return std::make_tuple(shared_bytes, block_size);
}

/* Export every non-reserved entry found in a window of slots.
 *
 * Thread `tid` inspects global slot `tid + offset` when `tid < search_length`
 * (no grid-stride loop). Matching entries are first staged as KVM tuples in
 * dynamic shared memory, then the block reserves a contiguous output range
 * through `*d_dump_counter` and the first `block_acc` threads write the
 * staged tuples out. Scores are written only when `d_score` is non-null.
 *
 * Dynamic shared memory must hold one KVM<K, V, S> per thread of the block.
 */
template <class K, class V, class S>
__global__ void dump_kernel(const Table<K, V, S>* __restrict table,
                            Bucket<K, V, S>* buckets, K* d_key,
                            V* __restrict d_val, S* __restrict d_score,
                            const size_t offset, const size_t search_length,
                            size_t* d_dump_counter) {
  extern __shared__ unsigned char s[];
  KVM<K, V, S>* const staged = reinterpret_cast<KVM<K, V, S>*>(s);

  // block_acc: entries staged by this block; global_acc: first output row
  // reserved for this block.
  __shared__ size_t block_acc;
  __shared__ size_t global_acc;

  const size_t bucket_max_size = table->bucket_max_size;
  const size_t dim = table->dim;
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadIdx.x == 0) {
    block_acc = 0;
  }
  __syncthreads();

  if (tid < search_length) {
    const size_t slot = tid + offset;
    Bucket<K, V, S>* const bucket = &buckets[slot / bucket_max_size];
    const int key_idx = static_cast<int>(slot % bucket_max_size);
    const K key = (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);

    if (!IS_RESERVED_KEY<K>(key)) {
      const size_t local_index = atomicAdd(&block_acc, 1);
      staged[local_index] = {
          key, &bucket->vectors[key_idx * dim],
          bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed)};
    }
  }
  __syncthreads();

  // One thread reserves the output range for the whole block.
  if (threadIdx.x == 0) {
    global_acc = atomicAdd(d_dump_counter, block_acc);
  }
  __syncthreads();

  if (threadIdx.x < block_acc) {
    const KVM<K, V, S>& entry = staged[threadIdx.x];
    const size_t out_row = global_acc + threadIdx.x;

    d_key[out_row] = entry.key;
    for (int e = 0; e < dim; ++e) {
      d_val[out_row * dim + e] = entry.value[e];
    }
    if (d_score != nullptr) {
      d_score[out_row] = entry.score;
    }
  }
}

/* Export every non-reserved entry in a window of slots whose (key, score)
 * satisfies PredFunctor with `pattern` and `threshold`.
 *
 * Thread `tid` inspects global slot `tid + offset` when `tid < search_length`
 * (no grid-stride loop). Matches are staged in dynamic shared memory, which
 * is carved into `blockDim.x` keys, then `blockDim.x * dim` values, then the
 * scores. The block then reserves an output range through `*d_dump_counter`
 * and copies the staged rows out. Scores are staged and written only when
 * `d_score` is non-null.
 */
template <class K, class V, class S,
          template <typename, typename> class PredFunctor>
__global__ void dump_kernel(const Table<K, V, S>* __restrict table,
                            Bucket<K, V, S>* buckets, const K pattern,
                            const S threshold, K* d_key, V* __restrict d_val,
                            S* __restrict d_score, const size_t offset,
                            const size_t search_length,
                            size_t* d_dump_counter) {
  extern __shared__ unsigned char s[];
  const size_t bucket_max_size = table->bucket_max_size;
  const size_t dim = table->dim;

  // Staging areas laid out back to back: keys | values | scores.
  K* staged_keys = reinterpret_cast<K*>(s);
  V* staged_vals = reinterpret_cast<V*>(staged_keys + blockDim.x);
  S* staged_scores = reinterpret_cast<S*>(staged_vals + blockDim.x * dim);

  __shared__ size_t block_acc;
  __shared__ size_t global_acc;
  PredFunctor<K, S> pred;

  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const bool want_scores = (d_score != nullptr);

  if (threadIdx.x == 0) {
    block_acc = 0;
  }
  __syncthreads();

  if (tid < search_length) {
    const int bkt_idx = (tid + offset) / bucket_max_size;
    const int key_idx = (tid + offset) % bucket_max_size;
    Bucket<K, V, S>* bucket = buckets + bkt_idx;

    const K key =
        (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);
    S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);

    if (!IS_RESERVED_KEY<K>(key) && pred(key, score, pattern, threshold)) {
      const size_t staged_row = atomicAdd(&block_acc, 1);
      staged_keys[staged_row] = key;
      for (int e = 0; e < dim; e++) {
        atomicExch(&(staged_vals[staged_row * dim + e]),
                   bucket->vectors[key_idx * dim + e]);
      }
      if (want_scores) {
        staged_scores[staged_row] = score;
      }
    }
  }
  __syncthreads();

  // One thread reserves the output range for the whole block.
  if (threadIdx.x == 0) {
    global_acc = atomicAdd(d_dump_counter, block_acc);
  }
  __syncthreads();

  if (threadIdx.x < block_acc) {
    const size_t out_row = global_acc + threadIdx.x;
    d_key[out_row] = staged_keys[threadIdx.x];
    for (int e = 0; e < dim; e++) {
      d_val[out_row * dim + e] = staged_vals[threadIdx.x * dim + e];
    }
    if (want_scores) {
      d_score[out_row] = staged_scores[threadIdx.x];
    }
  }
}

/* Export every non-reserved entry in a window of slots whose (key, score)
 * satisfies PredFunctor, without staging in shared memory.
 *
 * Thread `i` inspects global slot `i + offset`. Each tile of TILE_SIZE
 * threads counts its matches with a ballot, lane 0 reserves that many output
 * rows through `*d_dump_counter`, and each matching lane writes its key (and
 * score when `d_score` is non-null) at its rank among the tile's matches.
 * Value rows are then copied cooperatively by the whole tile, one matched
 * row at a time, as `VecV` elements.
 *
 * NOTE(review): the copy addresses the matched row through this thread's own
 * `bucket` and `leading_key_idx`, and the key rank uses `key_idx % TILE_SIZE`
 * as the lane index. Both look like they require all lanes of a tile to map
 * to the same bucket with tile-aligned slot indices — confirm against the
 * launcher.
 */
template <class K, class V, class S, class VecV,
          template <typename, typename> class PredFunctor, int TILE_SIZE>
__global__ void dump_kernel_v2(const Table<K, V, S>* __restrict table,
                               Bucket<K, V, S>* buckets, const K pattern,
                               const S threshold, K* d_key, V* __restrict d_val,
                               S* __restrict d_score, const size_t offset,
                               const size_t search_length,
                               size_t* d_dump_counter) {
  const size_t bucket_max_size = table->bucket_max_size;
  // Number of VecV elements that make up one value row.
  int vec_dim = table->dim * sizeof(V) / sizeof(VecV);
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());

  PredFunctor<K, S> pred;
  size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (size_t i = tid; i < search_length; i += gridDim.x * blockDim.x) {
    size_t bkt_idx = (i + offset) / bucket_max_size;
    size_t key_idx = (i + offset) % bucket_max_size;
    // First slot of the TILE_SIZE-aligned group that contains key_idx.
    size_t leading_key_idx = key_idx / TILE_SIZE * TILE_SIZE;
    Bucket<K, V, S>* bucket = &(buckets[bkt_idx]);

    const K key =
        (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);
    S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);

    bool match =
        (!IS_RESERVED_KEY<K>(key)) && pred(key, score, pattern, threshold);
    unsigned int vote = g.ballot(match);
    int tile_cnt = __popc(vote);
    size_t tile_offset = 0;
    // Lane 0 reserves tile_cnt output rows and shares the base with the tile.
    if (g.thread_rank() == 0) {
      tile_offset = atomicAdd(d_dump_counter, static_cast<size_t>(tile_cnt));
    }
    tile_offset = g.shfl(tile_offset, 0);
    // Number of matches in vote bits below bit (key_idx % TILE_SIZE).
    int bias_g = tile_cnt - __popc(vote >> (key_idx % TILE_SIZE));

    if (match) {
      d_key[tile_offset + bias_g] = key;
      if (d_score) {
        d_score[tile_offset + bias_g] = score;
      }
    }

#pragma unroll
    for (int r = 0; r < TILE_SIZE; r++) {
      unsigned int biased_vote = vote >> r;
      bool cur_match = biased_vote & 1;
      if (cur_match) {
        // Output rank of the match recorded in vote bit r.
        int bias = tile_cnt - __popc(biased_vote);
        size_t cur_idx = leading_key_idx + r;

        VecV* d_val_vec = reinterpret_cast<VecV*>(d_val);
        VecV* vec = reinterpret_cast<VecV*>(bucket->vectors);
        // All lanes of the tile share the copy of this one row.
        for (int j = g.thread_rank(); j < vec_dim; j += TILE_SIZE) {
          d_val_vec[(tile_offset + bias) * vec_dim + j] =
              vec[cur_idx * vec_dim + j];
        }
      }
    }
  }
}

/* Export every live entry in a window of slots for which `pred` returns true.
 *
 * Thread `i` inspects global slot `i + offset`. The predicate is invoked by
 * every lane with the group handle; slots holding reserved keys are then
 * excluded so that empty, reclaimed or locked slots are never exported, which
 * matches the other dump kernels and `remove_kernel_v2`. Each group counts
 * its matches with a ballot, lane 0 reserves that many output rows through
 * `*dump_counter`, and each matching lane writes its key (and score when
 * `out_scores` is non-null) at its rank among the group's matches. Value rows
 * are copied cooperatively by the whole group, one matched row at a time.
 *
 * `search_length`: number of slots to inspect, starting at `offset`.
 * `bucket_capacity`: slots per bucket.
 * `dim`: number of V elements per value row.
 */
template <typename K, typename V, typename S, typename PredFunctor,
          uint32_t GroupSize = 32>
__global__ void dump_kernel(const uint64_t search_length, const uint64_t offset,
                            PredFunctor pred, Bucket<K, V, S>* buckets,
                            const uint64_t bucket_capacity, const uint64_t dim,
                            K* __restrict__ out_keys, V* __restrict__ out_vals,
                            S* __restrict__ out_scores,
                            uint64_t* dump_counter) {
  cg::thread_block_tile<GroupSize> g =
      cg::tiled_partition<GroupSize>(cg::this_thread_block());

  uint64_t tid = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (uint64_t i = tid; i < search_length; i += gridDim.x * blockDim.x) {
    uint64_t bkt_idx = (i + offset) / bucket_capacity;
    uint64_t key_idx = (i + offset) % bucket_capacity;

    // May be different for threads within the same group.
    Bucket<K, V, S>* bucket = buckets + bkt_idx;

    const K key = bucket->keys(key_idx)->load(cuda::std::memory_order_relaxed);
    const S score =
        bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);
    const V* value = bucket->vectors + key_idx * dim;

    // The predicate receives the group handle, so it is called on every lane
    // and the reserved-key filter is applied afterwards.
    bool match = pred.template operator()<GroupSize>(key, value, score, g);
    if (IS_RESERVED_KEY<K>(key)) {
      match = false;
    }
    uint32_t vote = g.ballot(match);
    int group_cnt = __popc(vote);
    uint64_t group_offset = 0;
    // Lane 0 reserves group_cnt output rows and shares the base.
    if (g.thread_rank() == 0) {
      group_offset = atomicAdd(dump_counter, static_cast<uint64_t>(group_cnt));
    }
    group_offset = g.shfl(group_offset, 0);
    // Each thread gets the count of previous matches ranks.
    // Using `g.thread_rank()` instead of `key_idx % GroupSize` to handle case:
    // bucket_capacity < GroupSize.
    int previous_cnt = group_cnt - __popc(vote >> g.thread_rank());
    // Only matched threads need to output.
    if (match) {
      out_keys[group_offset + previous_cnt] = key;
      if (out_scores) {
        out_scores[group_offset + previous_cnt] = score;
      }
    }

    for (int r = 0; r < GroupSize; r++) {
      uint32_t biased_vote = vote >> r;
      bool cur_match = biased_vote & 1;
      if (cur_match) {
        // Output rank of the match found by lane r.
        int bias = group_cnt - __popc(biased_vote);

        // Recompute lane r's slot from the group's first index instead of
        // shuffling bkt_idx / key_idx from lane r.
        uint64_t cur_idx = (i / GroupSize) * GroupSize + r + offset;
        uint64_t cur_bkt_idx = cur_idx / bucket_capacity;
        uint64_t cur_key_idx = cur_idx % bucket_capacity;
        Bucket<K, V, S>* cur_bucket = buckets + cur_bkt_idx;

        // All lanes of the group share the copy of this one row.
        for (int j = g.thread_rank(); j < dim; j += GroupSize) {
          out_vals[(group_offset + bias) * dim + j] =
              cur_bucket->vectors[cur_key_idx * dim + j];
        }
      }
    }
  }
}

/* Count the non-reserved entries whose (key, score) satisfies PredFunctor
 * with `pattern` and `threshold`, and add the total to `*d_counter`.
 *
 * Every thread walks the slots [0, table->capacity) with a stride of the
 * whole grid, keeps a private count, adds it to a per-block shared counter,
 * and thread 0 of each block adds the block total to `*d_counter`.
 */
template <class K, class V, class S,
          template <typename, typename> class PredFunctor>
__global__ void size_if_kernel(const Table<K, V, S>* __restrict table,
                               Bucket<K, V, S>* buckets, const K pattern,
                               const S threshold, size_t* d_counter) {
  extern __shared__ unsigned char s[];

  __shared__ size_t block_acc;
  PredFunctor<K, S> pred;

  const size_t bucket_max_size = table->bucket_max_size;
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

  if (threadIdx.x == 0) {
    block_acc = 0;
  }
  __syncthreads();

  size_t matches_seen = 0;
  for (size_t slot = tid; slot < table->capacity;
       slot += blockDim.x * gridDim.x) {
    Bucket<K, V, S>* const bucket = &buckets[slot / bucket_max_size];
    const int key_idx = static_cast<int>(slot % bucket_max_size);

    const K key = (bucket->keys(key_idx))->load(cuda::std::memory_order_relaxed);
    S score = bucket->scores(key_idx)->load(cuda::std::memory_order_relaxed);

    if (IS_RESERVED_KEY(key)) {
      continue;
    }
    if (pred(key, score, pattern, threshold)) {
      ++matches_seen;
    }
  }
  atomicAdd(&block_acc, matches_seen);
  __syncthreads();

  if (threadIdx.x == 0) {
    atomicAdd(d_counter, block_acc);
  }
}

/**
 * Visits `search_length` consecutive table slots starting at flat position
 * `offset` and invokes `f` once per visited slot.
 *
 * For every slot the functor receives the key (loaded with relaxed ordering),
 * a pointer to the first element of the slot's value vector, a pointer to the
 * slot's score, and the `GroupSize`-wide tile the calling thread belongs to.
 *
 * NOTE(review): threads of one tile may run a different number of loop
 * iterations when `search_length` is not a multiple of the tile size; whether
 * that is safe depends on how `f` uses the tile - confirm against callers.
 */
template <typename K, typename V, typename S, typename ExecutionFunc,
          uint32_t GroupSize = 32>
__global__ void traverse_kernel(const uint64_t search_length,
                                const uint64_t offset, ExecutionFunc f,
                                Bucket<K, V, S>* buckets,
                                const uint64_t bucket_capacity,
                                const uint64_t dim) {
  cg::thread_block_tile<GroupSize> tile =
      cg::tiled_partition<GroupSize>(cg::this_thread_block());

  const uint64_t first =
      static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;

  for (uint64_t i = first; i < search_length; i += gridDim.x * blockDim.x) {
    // Flat slot position inside the whole table.
    const uint64_t slot = i + offset;
    const uint64_t key_idx = slot % bucket_capacity;

    // Neighbouring threads of a tile may land in different buckets.
    Bucket<K, V, S>* bucket = buckets + slot / bucket_capacity;

    const K key = bucket->keys(key_idx)->load(cuda::std::memory_order_relaxed);
    V* value = bucket->vectors + key_idx * dim;
    S* score = reinterpret_cast<S*>(bucket->scores(key_idx));

    f.template operator()<GroupSize>(key, value, score, tile);
  }
}

/**
 * Restores keys that were previously replaced by `LOCKED_KEY`.
 *
 * One thread handles one entry `kv_idx < n`:
 *  - if `locked_key_ptrs[kv_idx]` is non-null and the slot it points to
 *    currently holds `LOCKED_KEY`, the slot is overwritten with
 *    `keys[kv_idx]` and the entry counts as succeeded;
 *  - otherwise nothing is written to the slot and the entry counts as failed.
 *
 * @param n                Number of entries.
 * @param locked_key_ptrs  Per-entry pointer to the locked slot, or nullptr.
 * @param keys             Per-entry key to write back.
 * @param succeededs       Optional per-entry result flags; may be nullptr.
 *
 * The slot is read and written with plain (non-atomic) accesses.
 */
template <typename K>
__global__ void unlock_keys_kernel(uint64_t n, K** __restrict__ locked_key_ptrs,
                                   const K* __restrict__ keys,
                                   bool* __restrict__ succeededs) {
  // 64-bit index: a signed 32-bit index would wrap for launches covering
  // more than INT_MAX entries and silently skip the tail of the batch.
  const uint64_t kv_idx =
      static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (kv_idx >= n) {
    return;
  }

  K* const locked_key_ptr = locked_key_ptrs[kv_idx];
  bool flag = false;
  if (locked_key_ptr != nullptr &&
      *locked_key_ptr == static_cast<K>(LOCKED_KEY)) {
    *locked_key_ptr = keys[kv_idx];
    flag = true;
  }

  if (succeededs != nullptr) {
    succeededs[kv_idx] = flag;
  }
}

/**
 * Compacts the entries selected by `masks` into dense destination arrays.
 *
 * Thread `tid` owns source entry `tid`. Selected entries of one
 * `TILE_SIZE`-wide tile are written contiguously, in source order, starting
 * at `offsets[tid / TILE_SIZE]`. Keys (and scores, when both score pointers
 * are non-null) are copied by the owning thread; each value vector of `dim`
 * elements is copied cooperatively by the whole tile.
 *
 * @param masks       Per-entry selection flags, length `n`.
 * @param n           Number of source entries.
 * @param offsets     Destination start index per tile (one per TILE_SIZE
 *                    entries).
 * @param src_keys    Source keys.
 * @param src_values  Source values, `dim` elements per entry.
 * @param src_scores  Source scores; may be nullptr.
 * @param dst_keys    Destination keys.
 * @param dst_values  Destination values, `dim` elements per entry.
 * @param dst_scores  Destination scores; may be nullptr.
 * @param dim         Number of value elements per entry.
 *
 * NOTE(review): the indexing assumes a 1-D launch whose block size is a
 * multiple of TILE_SIZE, so that a tile maps to entries
 * [tid / TILE_SIZE * TILE_SIZE, +TILE_SIZE) - confirm against the launcher.
 */
template <typename K, typename V, typename S, typename Tidx, int TILE_SIZE = 8>
__global__ void compact_key_value_score_kernel(
    const bool* masks, size_t n, const Tidx* offsets,
    K* __restrict const src_keys, V* __restrict const src_values,
    S* __restrict const src_scores, K* __restrict dst_keys,
    V* __restrict dst_values, S* __restrict dst_scores, const size_t dim) {
  // Widen before multiplying; a signed 32-bit index wraps for large `n`.
  const size_t tid =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto g = cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
  const int rank = g.thread_rank();

  bool is_existed = false;
  if (tid < n) {
    is_existed = masks[tid];
  }

  // Bit r of `vote` is set when rank r of this tile holds a selected entry.
  const unsigned int vote = g.ballot(is_existed);
  // Reverse the bits so that rank r ends up at bit (TILE_SIZE - 1 - r).
  unsigned int r_vote = __brev(vote) >> (32 - TILE_SIZE);

  // Value-initialized so the shuffle below never reads an indeterminate
  // value from lanes that hold no selected entry.
  Tidx bias{};
  if (is_existed) {
    // Keep only the bits of ranks <= this rank; the population count minus
    // one is the number of selected entries preceding this one in the tile.
    r_vote = r_vote >> (TILE_SIZE - rank - 1);
    const int prefix_n = __popc(r_vote) - 1;
    bias = offsets[tid / TILE_SIZE] + static_cast<Tidx>(prefix_n);
    dst_keys[bias] = src_keys[tid];
    if (src_scores and dst_scores) dst_scores[bias] = src_scores[tid];
  }

  const size_t group_offset = (tid / TILE_SIZE) * TILE_SIZE;
  for (int i = 0; i < TILE_SIZE; i++) {
    const size_t src_row = group_offset + i;
    // Uniform across the tile: every lane shares group_offset and i.
    if (src_row >= n) return;
    auto cur_existed = g.shfl(is_existed, i);
    if (cur_existed) {
      auto cur_bias = g.shfl(bias, i);
      for (size_t j = rank; j < dim; j += TILE_SIZE) {
        dst_values[dim * cur_bias + j] = src_values[dim * src_row + j];
      }
    }
  }
}

/**
 * Looks up each input key in its bucket and, when found, replaces the stored
 * key with `LOCKED_KEY` via compare-and-swap.
 *
 * One thread handles one input key (`kv_idx < n`); threads beyond `n` return
 * immediately. The bucket is probed through its digest array, `STRIDE`
 * digests per load, starting at the aligned start position of the key.
 *
 * Outputs per key:
 *  - `locked_keys_ptr[kv_idx]`: address of the locked slot, or nullptr when
 *    the key is reserved, was not found, or the compare-and-swap failed.
 *  - `succeed[kv_idx]` (only when `succeed` is non-null): whether the slot
 *    was locked.
 * When the lock succeeds the score is updated through
 * `ScoreFunctor::update_with_digest`.
 *
 * NOTE(review): `(pos + offset) & (bucket_capacity - 1)` presumes that
 * `bucket_capacity` is a power of two - confirm against the table setup.
 */
template <typename K, typename V, typename S, int Strategy = -1>
__global__ void lock_kernel_with_filter(
    Bucket<K, V, S>* __restrict__ buckets, uint64_t const buckets_num,
    uint32_t bucket_capacity, uint32_t const dim, K const* __restrict__ keys,
    K** __restrict locked_keys_ptr, bool* __restrict succeed,
    S const* __restrict__ scores, const S global_epoch, uint64_t n) {
  using BUCKET = Bucket<K, V, S>;
  using ScoreFunctor = ScoreFunctor<K, V, S, Strategy>;
  // Load `STRIDE` digests every time.
  constexpr uint32_t STRIDE = sizeof(VecD_Load) / sizeof(D);

  uint32_t tx = threadIdx.x;
  uint32_t kv_idx = blockIdx.x * blockDim.x + tx;
  K key{static_cast<K>(EMPTY_KEY)};
  S score{static_cast<S>(EMPTY_SCORE)};
  OccupyResult occupy_result{OccupyResult::INITIAL};
  VecD_Comp target_digests{0};
  K* bucket_keys_ptr{nullptr};
  uint32_t key_pos = {0};
  if (kv_idx < n) {
    key = keys[kv_idx];
    score = ScoreFunctor::desired_when_missed(scores, kv_idx, global_epoch);
    if (!IS_RESERVED_KEY<K>(key)) {
      // Hash the key once; the hash selects both the bucket and the digest
      // pattern that is compared against the bucket's digest array.
      const K hashed_key = Murmur3HashDevice(key);
      target_digests = digests_from_hashed<K>(hashed_key);
      uint64_t global_idx =
          static_cast<uint64_t>(hashed_key % (buckets_num * bucket_capacity));
      key_pos = get_start_position(global_idx, bucket_capacity);
      uint64_t bkt_idx = global_idx / bucket_capacity;
      BUCKET* bucket = buckets + bkt_idx;
      bucket_keys_ptr = reinterpret_cast<K*>(bucket->keys(0));
    } else {
      // Reserved keys are never looked up; report them as not locked.
      occupy_result = OccupyResult::ILLEGAL;
      goto WRITE_BACK;
    }
  } else {
    return;
  }

  // One more loop to handle empty keys.
  for (int offset = 0; offset < bucket_capacity + STRIDE; offset += STRIDE) {
    // Probe position for this iteration, wrapped around the bucket.
    uint32_t pos_cur = align_to<STRIDE>(key_pos);
    pos_cur = (pos_cur + offset) & (bucket_capacity - 1);

    D* digests_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, pos_cur);
    VecD_Load digests_vec = *(reinterpret_cast<VecD_Load*>(digests_ptr));
    VecD_Comp digests_arr[4] = {digests_vec.x, digests_vec.y, digests_vec.z,
                                digests_vec.w};

    for (int i = 0; i < 4; i++) {
      VecD_Comp probe_digests = digests_arr[i];
      uint32_t possible_pos = 0;
      // Perform a vectorized comparison by byte,
      // and if they are equal, set the corresponding byte in the result to
      // 0xff.
      int cmp_result = __vcmpeq4(probe_digests, target_digests);
      // Keep only the lowest bit of every byte so that each matching byte
      // contributes exactly one set bit.
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        // CUDA uses little endian,
        // and the lowest byte in register stores in the lowest address.
        // Bit index of the lowest match divided by 8 gives its byte index.
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        // Clear the lowest set bit so the next iteration sees the next match.
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // A digest match is only a candidate; confirm with the full key.
        auto current_key = bucket_keys_ptr[possible_pos];
        if (current_key == key) {
          key_pos = possible_pos;
          occupy_result = OccupyResult::DUPLICATE;
          goto WRITE_BACK;
        }
      } while (true);
      // Second pass over the same digests: look for an empty slot, which
      // ends the search without a match.
      VecD_Comp empty_digests_ = empty_digests<K>();
      cmp_result = __vcmpeq4(probe_digests, empty_digests_);
      cmp_result &= 0x01010101;
      do {
        if (cmp_result == 0) break;
        uint32_t index = (__ffs(cmp_result) - 1) >> 3;
        cmp_result &= (cmp_result - 1);
        possible_pos = pos_cur + i * 4 + index;
        // In the first stride, slots before the key's start position are
        // skipped here.
        if (offset == 0 && possible_pos < key_pos) continue;
        auto current_key = bucket_keys_ptr[possible_pos];
        if (current_key == static_cast<K>(EMPTY_KEY)) {
          occupy_result = OccupyResult::OCCUPIED_EMPTY;
          goto WRITE_BACK;
        }
      } while (true);
    }
  }

WRITE_BACK:
  // Only a key that was actually found (DUPLICATE) is a lock candidate.
  bool found_ = occupy_result == OccupyResult::DUPLICATE;
  if (found_) {
    auto current_key = BUCKET::keys(bucket_keys_ptr, key_pos);
    K expected_key = key;
    // NOTE(review): the original comment read "Modifications to the bucket
    // will not before this instruction." - presumably meaning that bucket
    // modifications must not be ordered before this compare-and-swap.
    // Confirm, since the operation uses relaxed memory ordering.
    bool result = current_key->compare_exchange_strong(
        expected_key, static_cast<K>(LOCKED_KEY),
        cuda::std::memory_order_relaxed, cuda::std::memory_order_relaxed);
    if (not result) {
      // The slot no longer held `key` when the swap was attempted.
      found_ = false;
    } else {
      ScoreFunctor::update_with_digest(bucket_keys_ptr, key_pos, scores, kv_idx,
                                       score, bucket_capacity,
                                       get_digest<K>(key), false);
    }
  }
  if (found_) {
    locked_keys_ptr[kv_idx] = bucket_keys_ptr + key_pos;
  } else {
    locked_keys_ptr[kv_idx] = nullptr;
  }
  if (succeed) {
    succeed[kv_idx] = found_;
  }
}

/**
 * Thin stateful wrapper around `cub::DeviceRadixSort::SortPairs`.
 *
 * Usage:
 *   1. `get_storage_bytes(batch, stream)` - query the scratch size for
 *      `batch` items and remember `batch`.
 *   2. `set_storage(ptr)` - hand in a device buffer of at least that size.
 *   3. `sort(batch, ...)` - run the sort; `batch` must match step 1.
 *
 * The wrapper does not own the scratch buffer. After calling
 * `get_storage_bytes` again with a different batch, the caller must provide
 * storage that is large enough for the newly returned size.
 */
template <typename KeyT, typename ValueT>
struct SortPairOp {
  SortPairOp() : d_temp_storage(nullptr), temp_storage_bytes(0) {}

  /**
   * Returns the number of scratch bytes CUB needs to sort `batch` pairs.
   *
   * The query always passes a null scratch pointer. CUB only treats the call
   * as a size query when that pointer is null; passing the member here would
   * launch a real sort on null input/output buffers whenever the object is
   * re-queried after `set_storage`.
   */
  size_t get_storage_bytes(int batch, cudaStream_t stream) {
    num_items = batch;
    void* size_query_storage = nullptr;
    cub::DeviceRadixSort::SortPairs<KeyT, ValueT>(
        size_query_storage, temp_storage_bytes, nullptr, nullptr, nullptr,
        nullptr, num_items, 0, sizeof(KeyT) * 8, stream);

    return temp_storage_bytes;
  }

  /// Registers the caller-owned scratch buffer used by `sort`.
  void set_storage(void* storage) { d_temp_storage = storage; }

  /**
   * Sorts `batch` key/value pairs by key over all key bits on `stream`.
   *
   * @throws std::runtime_error if `batch` differs from the value passed to
   *         the last `get_storage_bytes` call.
   */
  void sort(int batch, KeyT const* d_keys_in, KeyT* d_keys_out,
            ValueT const* d_values_in, ValueT* d_values_out,
            cudaStream_t stream) {
    if (batch != num_items) {
      throw std::runtime_error("Number of items is not matched when sort.");
    }
    cub::DeviceRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in,
        d_values_out, num_items, 0, sizeof(KeyT) * 8, stream);
  }

  void* d_temp_storage{nullptr};   // Caller-owned scratch buffer.
  size_t temp_storage_bytes{0};    // Size reported by the last size query.
  int num_items{0};                // Batch size of the last size query.
};

/**
 * Thin stateful wrapper around `cub::DeviceReduce::Reduce` with
 * `cuda::std::plus<>` and an initial value of 0.
 *
 * Usage:
 *   1. `get_storage_bytes(batch, stream)` - query the scratch size for
 *      `batch` items and remember `batch`.
 *   2. `set_storage(ptr)` - hand in a device buffer of at least that size.
 *   3. `sum(batch, ...)` - run the reduction; `batch` must match step 1.
 *
 * The wrapper does not own the scratch buffer. After calling
 * `get_storage_bytes` again with a different batch, the caller must provide
 * storage that is large enough for the newly returned size.
 */
template <typename InputT, typename OutputT>
struct SumOp {
  using InputIteratorT = InputT const*;
  using OutputIteratorT = OutputT*;
  SumOp() : d_temp_storage(nullptr), temp_storage_bytes(0) {}

  /**
   * Returns the number of scratch bytes CUB needs to reduce `batch` items.
   *
   * The query always passes a null scratch pointer. CUB only treats the call
   * as a size query when that pointer is null; passing the member here would
   * launch a real reduction on null buffers whenever the object is
   * re-queried after `set_storage`.
   */
  size_t get_storage_bytes(int batch, cudaStream_t stream) {
    num_items = batch;
    void* size_query_storage = nullptr;
    cub::DeviceReduce::Reduce<InputIteratorT, OutputIteratorT>(
        size_query_storage, temp_storage_bytes, nullptr, nullptr, num_items,
        cuda::std::plus<>(), 0, stream);
    return temp_storage_bytes;
  }

  /// Registers the caller-owned scratch buffer used by `sum`.
  void set_storage(void* storage) { d_temp_storage = storage; }

  /**
   * Sums `batch` items from `d_in` into `*d_out` on `stream`.
   *
   * @throws std::runtime_error if `batch` differs from the value passed to
   *         the last `get_storage_bytes` call.
   */
  void sum(int batch, InputIteratorT d_in, OutputIteratorT d_out,
           cudaStream_t stream) {
    if (batch != num_items) {
      throw std::runtime_error("Number of items is not matched when sum.");
    }
    cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
                              num_items, cuda::std::plus<>(), 0, stream);
  }

  void* d_temp_storage{nullptr};   // Caller-owned scratch buffer.
  size_t temp_storage_bytes{0};    // Size reported by the last size query.
  int num_items{0};                // Batch size of the last size query.
};

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/debug.hpp
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cuda_runtime_api.h>
#include <sstream>
#include <stdexcept>
#include <string>

namespace nv {
namespace merlin {

/// Exception type raised when a CUDA runtime call reports an error.
class CudaException : public std::runtime_error {
 public:
  /// @param what  Human-readable description returned by `what()`.
  CudaException(const std::string& what) : std::runtime_error{what} {}
};

/**
 * Throws `CudaException` when `val` is not `cudaSuccess`.
 *
 * The message has the form
 * `<file>:<line>: CUDA error <name> (#<code>): <description>`.
 *
 * @param val   Status returned by a CUDA runtime call.
 * @param file  Source file of the call site.
 * @param line  Source line of the call site.
 */
inline void cuda_check_(cudaError_t val, const char* file, int line) {
  if (val == cudaSuccess) {
    return;
  }
  std::ostringstream message;
  message << file << ':' << line << ": CUDA error " << cudaGetErrorName(val)
          << " (#" << val << "): " << cudaGetErrorString(val);
  throw CudaException(message.str());
}

// Refuse to compile if another header already defined CUDA_CHECK, so the two
// definitions cannot silently diverge.
#ifdef CUDA_CHECK
#error Unexpected redfinition of CUDA_CHECK! Something is wrong.
#endif

// Evaluates `val` (a cudaError_t expression) once and throws
// nv::merlin::CudaException, annotated with the call site's file and line,
// when it is not cudaSuccess. Wrapped in do/while(0) so it behaves like a
// single statement.
#define CUDA_CHECK(val)                                 \
  do {                                                  \
    nv::merlin::cuda_check_((val), __FILE__, __LINE__); \
  } while (0)

/// Exception type raised when a HierarchicalKV precondition check fails.
class MerlinException : public std::runtime_error {
 public:
  /// @param what  Human-readable description returned by `what()`.
  MerlinException(const std::string& what) : std::runtime_error{what} {}
};

/**
 * Throws `MerlinException` when `cond` is false.
 *
 * The message has the form `<file>:<line>: HierarchicalKV error <msg>`.
 *
 * @tparam Msg  Any type that can be streamed into a `std::ostream`.
 * @param cond  Condition that is expected to hold.
 * @param msg   Description appended to the message on failure.
 * @param file  Source file of the call site.
 * @param line  Source line of the call site.
 */
template <class Msg>
inline void merlin_check_(bool cond, const Msg& msg, const char* file,
                          int line) {
  if (cond) {
    return;
  }
  std::ostringstream message;
  message << file << ':' << line << ": HierarchicalKV error " << msg;
  throw MerlinException(message.str());
}

// Refuse to compile if another header already defined MERLIN_CHECK, so the
// two definitions cannot silently diverge.
#ifdef MERLIN_CHECK
#error Unexpected redfinition of MERLIN_CHECK! Something is wrong.
#endif

// Throws nv::merlin::MerlinException, annotated with the call site's file and
// line and with `msg`, when `cond` evaluates to false. Wrapped in do/while(0)
// so it behaves like a single statement.
#define MERLIN_CHECK(cond, msg)                                   \
  do {                                                            \
    nv::merlin::merlin_check_((cond), (msg), __FILE__, __LINE__); \
  } while (0)

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/flexible_buffer.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <iostream>
#include "utils.cuh"

using std::cerr;
using std::endl;

namespace nv {
namespace merlin {

/**
 * Grow-only buffer of `T` elements allocated with `cudaMallocHost`.
 *
 * The buffer is allocated on construction and re-allocated by
 * `alloc_or_reuse` only when a larger element count is requested; existing
 * contents are not preserved across a re-allocation.
 *
 * NOTE(review): the class owns a raw pointer but keeps the implicitly
 * generated copy operations; copying an instance would free the same
 * allocation twice. Callers should not copy it.
 */
template <class T>
class FlexPinnedBuffer {
 public:
  /**
   * Allocates room for `size` elements.
   *
   * @throws CudaException if the allocation fails.
   */
  FlexPinnedBuffer(const size_t size = 1) : ptr_(nullptr), size_(size) {
    CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size_));
  }

  /// Releases the buffer; a failing free is reported on stderr, not thrown.
  ~FlexPinnedBuffer() {
    try {
      if (ptr_) CUDA_CHECK(cudaFreeHost(ptr_));
    } catch (const nv::merlin::CudaException& e) {
      cerr << "[HierarchicalKV] Failed to free FlexPinnedBuffer!" << endl;
    }
  }

  /**
   * Returns a buffer holding at least `size` elements.
   *
   * The current buffer is reused when it is large enough; otherwise it is
   * freed and a new one of exactly `size` elements is allocated.
   *
   * @throws CudaException if freeing or allocating fails. In that case the
   *         object is left empty (null pointer, size 0), so the destructor
   *         cannot free the old allocation a second time.
   */
  __inline__ T* alloc_or_reuse(const size_t size = 0) {
    if (size > size_) {
      // Disown the old allocation before any call that may throw.
      T* const stale = ptr_;
      ptr_ = nullptr;
      size_ = 0;
      CUDA_CHECK(cudaFreeHost(stale));
      CUDA_CHECK(cudaMallocHost(&ptr_, sizeof(T) * size));
      size_ = size;
    }
    return ptr_;
  }

 private:
  T* ptr_;       // Start of the allocation, or nullptr when empty.
  size_t size_;  // Capacity in elements of the current allocation.
};

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/group_lock.cuh
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include <atomic>
#include <cassert>
#include <mutex>
#include <system_error>
#include <thread>
#include "core_kernels/group_lock_kernels.cuh"
#include "utils.cuh"

namespace nv {
namespace merlin {

/*
 * Thread-local cached CUDA stream for lock acquisition kernels.
 * Eliminates per-call cudaStreamCreate/Destroy overhead which causes
 * CUDA driver contention when multiple threads acquire locks concurrently.
 */
// Returns the calling thread's cached stream, creating it on first use.
// The stream is created once per thread and is never destroyed here.
// Throws CudaException if the stream cannot be created.
inline cudaStream_t get_lock_stream() {
  auto create_stream = []() -> cudaStream_t {
    cudaStream_t created;
    CUDA_CHECK(cudaStreamCreate(&created));
    return created;
  };
  thread_local cudaStream_t cached_stream = create_stream();
  return cached_stream;
}

/*
 * Implements a triple-group mutex and the related lock guards for better E2E
 * performance:
 * - There are three roles: `inserter`, `updater`, and `reader`.
 * - Only one inserter is allowed to execute at a time (e.g.
 * `insert_or_assign`, `insert_and_evict`, `find_or_insert`, etc.).
 * - Multiple updaters are allowed to execute concurrently (e.g. `assign`,
 * etc.). The CUDA kernels guarantee data consistency in this situation.
 * - Multiple readers are allowed to execute concurrently (e.g. `find`, `size`,
 * etc.).
 * - Inserters, readers, and updaters are not allowed to run concurrently with
 * each other.
 * - The `update_read_lock` is exclusive and is used for special APIs (e.g.
 * `reserve`, `erase`, `clear`, etc.).
 */
/*
 * Mutex with a reader group, an updater group and an exclusive mode.
 *
 * The state is kept twice: host-side `std::atomic` counters that the calling
 * CPU threads spin on, and device-side `cuda::atomic` counters that are
 * manipulated by single-thread kernels from the `group_lock` namespace.
 *
 * Acquiring a lock blocks the calling thread: it spins on the host counters
 * and then launches a kernel on the thread's cached lock stream and waits for
 * it. Releasing a lock enqueues a kernel on the stream given by the caller
 * and does not wait for it.
 *
 * NOTE(review): several members are declared `noexcept` although they use
 * CUDA_CHECK, which throws CudaException on failure; a CUDA error in those
 * members would therefore end in std::terminate.
 */
class group_shared_mutex {
 public:
  group_shared_mutex(const group_shared_mutex&) = delete;
  group_shared_mutex& operator=(const group_shared_mutex&) = delete;

  // Allocates the three device-side atomics, initializes them with
  // `init_kernel` (launched without an explicit stream) and waits for the
  // device to finish.
  group_shared_mutex() noexcept
      : h_update_count_(0), h_read_count_(0), h_unique_flag_(false) {
    CUDA_CHECK(
        cudaMalloc(&d_update_count_,
                   sizeof(cuda::atomic<int, cuda::thread_scope_device>)));
    CUDA_CHECK(cudaMalloc(
        &d_read_count_, sizeof(cuda::atomic<int, cuda::thread_scope_device>)));
    CUDA_CHECK(
        cudaMalloc(&d_unique_flag_,
                   sizeof(cuda::atomic<bool, cuda::thread_scope_device>)));
    group_lock::init_kernel<<<1, 1, 0>>>(d_update_count_, d_read_count_,
                                         d_unique_flag_);
    CUDA_CHECK(cudaDeviceSynchronize());
  }

  // Waits for all device work to finish, then frees the device-side atomics.
  ~group_shared_mutex() noexcept {
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaFree(d_update_count_));
    CUDA_CHECK(cudaFree(d_read_count_));
    CUDA_CHECK(cudaFree(d_unique_flag_));
  }

  // Acquires the lock for a reader. Blocks while any updater is registered.
  void lock_read() {
    for (;;) {
      // Spin until no updater is registered on the host side.
      while (h_update_count_.load(std::memory_order_acquire)) {
      }
      // Register optimistically, then re-check: an updater may have
      // registered between the spin above and the increment.
      h_read_count_.fetch_add(1, std::memory_order_acq_rel);
      if (h_update_count_.load(std::memory_order_acquire) == 0) {
        {
          // Mirror the acquisition on the device and wait for it.
          cudaStream_t stream = get_lock_stream();
          group_lock::lock_read_kernel<<<1, 1, 0, stream>>>(d_update_count_,
                                                            d_read_count_);
          CUDA_CHECK(cudaStreamSynchronize(stream));
        }
        break;
      }
      // Lost the race against an updater: back off and retry.
      h_read_count_.fetch_sub(1, std::memory_order_acq_rel);
    }
  }

  // Releases a reader lock. The device-side release is only enqueued on
  // `stream`; the host counter is decremented immediately.
  void unlock_read(cudaStream_t stream) {
    { group_lock::unlock_read_kernel<<<1, 1, 0, stream>>>(d_read_count_); }
    h_read_count_.fetch_sub(1, std::memory_order_release);
  }

  // Acquires the lock for an updater. Blocks while any reader is registered.
  // Mirror image of lock_read() with the two counters swapped.
  void lock_update() {
    for (;;) {
      while (h_read_count_.load(std::memory_order_acquire)) {
      }
      h_update_count_.fetch_add(1, std::memory_order_acq_rel);
      if (h_read_count_.load(std::memory_order_acquire) == 0) {
        {
          cudaStream_t stream = get_lock_stream();
          group_lock::lock_update_kernel<<<1, 1, 0, stream>>>(d_update_count_,
                                                              d_read_count_);
          CUDA_CHECK(cudaStreamSynchronize(stream));
        }
        break;
      }
      h_update_count_.fetch_sub(1, std::memory_order_acq_rel);
    }
  }

  // Releases an updater lock. The device-side release is only enqueued on
  // `stream`; the host counter is decremented immediately.
  void unlock_update(cudaStream_t stream) {
    { group_lock::unlock_update_kernel<<<1, 1, 0, stream>>>(d_update_count_); }
    h_update_count_.fetch_sub(1, std::memory_order_release);
  }

  // Acquires the lock exclusively: the holder registers itself as both a
  // reader and an updater, which keeps every other reader and updater out.
  void lock_update_read() {
    /* Lock unique flag */
    // Only one exclusive holder at a time; `expected` must be reset because
    // a failed compare-exchange overwrites it with the observed value.
    bool expected = false;
    while (!h_unique_flag_.compare_exchange_weak(expected, true,
                                                 std::memory_order_acq_rel)) {
      expected = false;
    }

    /* Ban update */
    // Same optimistic register-and-recheck scheme as lock_read().
    for (;;) {
      while (h_update_count_.load(std::memory_order_acquire)) {
      }
      h_read_count_.fetch_add(1, std::memory_order_acq_rel);
      if (h_update_count_.load(std::memory_order_acquire) == 0) {
        break;
      }
      h_read_count_.fetch_sub(1, std::memory_order_acq_rel);
    }

    /* Ban read */
    // The read count already includes this thread, hence the comparisons
    // against 1 instead of 0.
    for (;;) {
      while (h_read_count_.load(std::memory_order_acquire) > 1) {
      }
      h_update_count_.fetch_add(1, std::memory_order_acq_rel);
      if (h_read_count_.load(std::memory_order_acquire) == 1) {
        break;
      }
      h_update_count_.fetch_sub(1, std::memory_order_acq_rel);
    }

    {
      // Mirror the acquisition on the device and wait for it.
      cudaStream_t stream = get_lock_stream();
      group_lock::lock_update_read_kernel<<<1, 1, 0, stream>>>(
          d_update_count_, d_read_count_, d_unique_flag_);
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }
  }

  // Releases the exclusive lock. The device-side release is only enqueued on
  // `stream`; the host state is reset immediately.
  void unlock_update_read(cudaStream_t stream) {
    {
      group_lock::unlock_update_read_kernel<<<1, 1, 0, stream>>>(
          d_update_count_, d_read_count_, d_unique_flag_);
    }
    h_read_count_.fetch_sub(1, std::memory_order_release);
    h_update_count_.fetch_sub(1, std::memory_order_release);
    h_unique_flag_.store(false, std::memory_order_release);
  }

  // Returns the value that `update_count_kernel` reports for the
  // device-side update counter. Creates a temporary stream and a temporary
  // device integer on every call.
  int update_count() noexcept {
    int count = 0;
    int* d_count;
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    CUDA_CHECK(cudaMalloc(&d_count, sizeof(int)));
    group_lock::update_count_kernel<<<1, 1, 0, stream>>>(d_count,
                                                         d_update_count_);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDefault));
    CUDA_CHECK(cudaFree(d_count));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return count;
  }

  // Returns the value that `read_count_kernel` reports for the device-side
  // read counter. Creates a temporary stream and a temporary device integer
  // on every call.
  int read_count() noexcept {
    int count = 0;
    int* d_count;
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));
    CUDA_CHECK(cudaMalloc(&d_count, sizeof(int)));
    group_lock::read_count_kernel<<<1, 1, 0, stream>>>(d_count, d_read_count_);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDefault));
    CUDA_CHECK(cudaFree(d_count));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return count;
  }

 private:
  // Host-side state that CPU threads spin on.
  std::atomic<int> h_update_count_;
  std::atomic<int> h_read_count_;
  std::atomic<bool> h_unique_flag_;

  // Device-side counterparts, allocated with cudaMalloc in the constructor.
  cuda::atomic<int, cuda::thread_scope_device>* d_update_count_;
  cuda::atomic<int, cuda::thread_scope_device>* d_read_count_;
  cuda::atomic<bool, cuda::thread_scope_device>* d_unique_flag_;
};

/*
 * Scoped guard for the reader side of a `group_shared_mutex`.
 *
 * The guard acquires the lock either on construction or, when constructed
 * with `std::defer_lock`, on the first call to `lock()`. The destructor
 * releases the lock if it is held; the release is enqueued on the stream
 * passed at construction.
 */
class read_shared_lock {
 public:
  read_shared_lock(const read_shared_lock&) = delete;
  read_shared_lock(read_shared_lock&&) = delete;

  read_shared_lock& operator=(const read_shared_lock&) = delete;
  read_shared_lock& operator=(read_shared_lock&&) = delete;

  // Acquires the reader lock immediately. May throw CudaException.
  explicit read_shared_lock(group_shared_mutex& mutex, cudaStream_t stream = 0)
      : mutex_(&mutex), owns_(false), stream_(stream) {
    mutex_->lock_read();
    owns_ = true;
  }

  // Binds to `mutex` without acquiring it; call lock() later.
  // Initializers are listed in declaration order, which is the order in
  // which they are executed.
  explicit read_shared_lock(group_shared_mutex& mutex, std::defer_lock_t,
                            cudaStream_t stream = 0)
      : mutex_(&mutex), owns_(false), stream_(stream) {}

  ~read_shared_lock() {
    if (owns_) {
      mutex_->unlock_read(stream_);
    }
  }

  // Acquires the reader lock unless this guard already holds it.
  // Not noexcept: lock_read() reports CUDA failures by throwing
  // CudaException, which a noexcept specifier would turn into
  // std::terminate.
  void lock() {
    if (!owns_) {
      mutex_->lock_read();
      owns_ = true;
    }
  }

  bool owns_lock() const noexcept { return owns_; }

 private:
  group_shared_mutex* const mutex_;  // Guarded mutex; never null.
  bool owns_;                        // True while the reader lock is held.
  cudaStream_t stream_;              // Stream used for the release.
};

/*
 * Scoped guard for the updater side of a `group_shared_mutex`.
 *
 * The guard acquires the lock either on construction or, when constructed
 * with `std::defer_lock`, on the first call to `lock()`. The destructor
 * releases the lock if it is held; the release is enqueued on the stream
 * passed at construction.
 */
class update_shared_lock {
 public:
  update_shared_lock(const update_shared_lock&) = delete;
  update_shared_lock(update_shared_lock&&) = delete;

  update_shared_lock& operator=(const update_shared_lock&) = delete;
  update_shared_lock& operator=(update_shared_lock&&) = delete;

  // Acquires the updater lock immediately. May throw CudaException.
  explicit update_shared_lock(group_shared_mutex& mutex,
                              cudaStream_t stream = 0)
      : mutex_(&mutex), owns_(false), stream_(stream) {
    mutex_->lock_update();
    owns_ = true;
  }

  // Binds to `mutex` without acquiring it; call lock() later.
  // Initializers are listed in declaration order, which is the order in
  // which they are executed.
  explicit update_shared_lock(group_shared_mutex& mutex, std::defer_lock_t,
                              cudaStream_t stream = 0)
      : mutex_(&mutex), owns_(false), stream_(stream) {}

  ~update_shared_lock() {
    if (owns_) {
      mutex_->unlock_update(stream_);
    }
  }

  // Acquires the updater lock unless this guard already holds it.
  // Not noexcept: lock_update() reports CUDA failures by throwing
  // CudaException, which a noexcept specifier would turn into
  // std::terminate.
  void lock() {
    if (!owns_) {
      mutex_->lock_update();
      owns_ = true;
    }
  }

  bool owns_lock() const noexcept { return owns_; }

 private:
  group_shared_mutex* const mutex_;  // Guarded mutex; never null.
  bool owns_;                        // True while the updater lock is held.
  cudaStream_t stream_;              // Stream used for the release.
};

/*
 * Scoped guard for the exclusive mode of a `group_shared_mutex`.
 *
 * The guard acquires the lock either on construction or, when constructed
 * with `std::defer_lock`, through `lock()`. The destructor releases the lock
 * if it is held; the release is enqueued on the stream passed at
 * construction.
 */
class update_read_lock {
 public:
  update_read_lock(const update_read_lock&) = delete;
  update_read_lock(update_read_lock&&) = delete;

  update_read_lock& operator=(const update_read_lock&) = delete;
  update_read_lock& operator=(update_read_lock&&) = delete;

  // Acquires the exclusive lock immediately.
  explicit update_read_lock(group_shared_mutex& mutex, cudaStream_t stream = 0)
      : mutex_(&mutex) {
    mutex_->lock_update_read();
    stream_ = stream;
    owns_ = true;
  }

  // Binds to `mutex` without acquiring it; call lock() later.
  explicit update_read_lock(group_shared_mutex& mutex, std::defer_lock_t,
                            cudaStream_t stream = 0) noexcept
      : mutex_(&mutex), owns_(false), stream_(stream) {}

  ~update_read_lock() {
    if (!owns_) {
      return;
    }
    mutex_->unlock_update_read(stream_);
  }

  // Acquires the exclusive lock. Must not be called while the lock is
  // already held by this guard (checked by an assertion only).
  void lock() {
    assert(!owns_ && "[update_read_lock] trying to lock twice!");
    mutex_->lock_update_read();
    owns_ = true;
  }

  bool owns_lock() const noexcept { return owns_; }

 private:
  group_shared_mutex* const mutex_;  // Guarded mutex.
  bool owns_;                        // True while the exclusive lock is held.
  cudaStream_t stream_;              // Stream used for the release.
};

using insert_unique_lock = update_read_lock;

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/memory_pool.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <array>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#include "allocator.cuh"
#include "debug.hpp"

namespace nv {
namespace merlin {

/**
 * Allocators are used by the memory pool (and maybe other classes) to create
 * RAII compliant containers for buffers allocated in different memory areas.
 */
/**
 * CRTP-style base that builds smart pointers on top of the static
 * `alloc` / `free` functions of the derived `Allocator`.
 *
 * @tparam T          Element type of the buffers.
 * @tparam Allocator  Derived allocator providing
 *                    `alloc(n, allocator[, stream])` and
 *                    `free(ptr, allocator[, stream])`.
 */
template <class T, class Allocator>
struct AllocatorBase {
  using type = T;
  using sync_unique_ptr = std::unique_ptr<type, std::function<void(type*)>>;
  using async_unique_ptr = std::unique_ptr<type, std::function<void(type*)>>;
  using shared_ptr = std::shared_ptr<type>;

  /// Allocates `n` elements; the deleter frees without a stream argument.
  inline static sync_unique_ptr make_unique(size_t n,
                                            BaseAllocator* allocator) {
    return {Allocator::alloc(n, allocator),
            [allocator](type* p) { Allocator::free(p, allocator); }};
  }

  /**
   * Allocates `n` elements in the context of `stream`.
   *
   * The deleter releases the buffer in the context of the same stream, in
   * line with `make_shared`. The stream was previously captured but not
   * forwarded, so the release always used the allocator's default stream
   * argument. `stream` must still be valid when the pointer is destroyed.
   */
  inline static async_unique_ptr make_unique(size_t n, BaseAllocator* allocator,
                                             cudaStream_t stream) {
    return {Allocator::alloc(n, allocator, stream),
            [stream, allocator](type* p) {
              Allocator::free(p, allocator, stream);
            }};
  }

  /// Allocates `n` elements in the context of `stream`; the deleter releases
  /// the buffer in the context of the same stream.
  inline static shared_ptr make_shared(size_t n, BaseAllocator* allocator,
                                       cudaStream_t stream = 0) {
    return {Allocator::alloc(n, allocator, stream),
            [stream, allocator](type* p) {
              Allocator::free(p, allocator, stream);
            }};
  }
};

/**
 * Trivial fallback implementation using the standard C++ allocator. This mostly
 * exists to ensure interface correctness, and as an illustration of what a
 * proper allocator implementation should look like.
 */
template <class T>
struct StandardAllocator final : AllocatorBase<T, StandardAllocator<T>> {
  using type = typename AllocatorBase<T, StandardAllocator<T>>::type;

  static constexpr const char* name{"StandardAllocator"};

  /// Requests `n * sizeof(T)` bytes of `MemoryType::Host` memory from
  /// `allocator`. `stream` is accepted for interface parity and is unused.
  inline static type* alloc(size_t n, BaseAllocator* allocator,
                            cudaStream_t stream = 0) {
    type* buffer;
    const size_t num_bytes = n * sizeof(T);
    allocator->alloc(MemoryType::Host, reinterpret_cast<void**>(&buffer),
                     num_bytes);
    return buffer;
  }

  /// Returns `ptr` to `allocator` as `MemoryType::Host` memory. `stream` is
  /// accepted for interface parity and is unused.
  inline static void free(type* ptr, BaseAllocator* allocator,
                          cudaStream_t stream = 0) {
    allocator->free(MemoryType::Host, ptr);
  }
};

/**
 * Claim/release buffers in pinned host memory.
 */
template <class T>
struct HostAllocator final : AllocatorBase<T, HostAllocator<T>> {
  using type = typename AllocatorBase<T, HostAllocator<T>>::type;

  static constexpr const char* name{"HostAllocator"};

  /// Requests `n * sizeof(T)` bytes of `MemoryType::Pinned` memory from
  /// `allocator`. `stream` is accepted for interface parity and is unused.
  inline static type* alloc(size_t n, BaseAllocator* allocator,
                            cudaStream_t stream = 0) {
    void* raw;
    const size_t num_bytes = n * sizeof(T);
    allocator->alloc(MemoryType::Pinned, &raw, num_bytes);
    return static_cast<type*>(raw);
  }

  /// Returns `ptr` to `allocator` as `MemoryType::Pinned` memory. `stream`
  /// is accepted for interface parity and is unused.
  inline static void free(type* ptr, BaseAllocator* allocator,
                          cudaStream_t stream = 0) {
    allocator->free(MemoryType::Pinned, ptr);
  }
};

/**
 * Claim/release buffers in the active CUDA device. Will not test if the correct
 * device was used, and throw if CUDA runtime API response is negative.
 */
template <class T>
struct DeviceAllocator final : AllocatorBase<T, DeviceAllocator<T>> {
  using type = typename AllocatorBase<T, DeviceAllocator<T>>::type;

  static constexpr const char* name{"DeviceAllocator"};

  /// Requests `n * sizeof(T)` bytes of `MemoryType::Device` memory through
  /// `allocator->alloc_async` on `stream`.
  inline static type* alloc(size_t n, BaseAllocator* allocator,
                            cudaStream_t stream = 0) {
    void* raw;
    const size_t num_bytes = n * sizeof(T);
    allocator->alloc_async(MemoryType::Device, &raw, num_bytes, stream);
    return static_cast<type*>(raw);
  }

  /// Returns `ptr` through `allocator->free_async` on `stream`.
  inline static void free(type* ptr, BaseAllocator* allocator,
                          cudaStream_t stream = 0) {
    allocator->free_async(MemoryType::Device, ptr, stream);
  }
};

/**
 * Helper structure to configure a memory pool.
 */
struct MemoryPoolOptions {
  /// Amount of buffers to keep in reserve.
  size_t max_stock = 4;
  /// Maximum amount of awaitable buffers. If this limit is exceeded threads
  /// will start to block.
  size_t max_pending = 16;
};

/**
 * Forward declares required to make templated ostream overload work.
 */
template <class Allocator>
class MemoryPool;

template <class Allocator>
std::ostream& operator<<(std::ostream&, const MemoryPool<Allocator>&);

/**
 * CUDA deferred execution aware memory pool implementation. As for every memory
 * pool, the general idea is to have reusable buffers. All buffers have the same
 * size.
 *
 * General behavior:
 *
 * This memory pool implementation attempts to avoid blocking before the fact,
 * but also avoids relying on a background worker.
 *
 * Buffer borrow and return semantics tightly align with C++ RAII principles.
 * That is, if a workspace is requested, any borrowed buffers will be returned
 * automatically when leaving the scope.
 *
 * You can either borrow a single buffer, or a workspace (that is multiple
 * buffers). We support dynamic and static workspaces. Static workspaces have
 * the benefit that they will never require heap memory (no hidden allocations).
 *
 *
 * Buffer borrowing:
 *
 * If buffers are requested, we take them from the stock, if available. If the
 * stock is depleted, we check if any pending buffers have been used up by the
 * GPU and add them to the stock. If that was also not successful, we allocate
 * a new buffer. This applies to both buffers and workspaces (groups of
 * buffers).
 *
 * When borrowing a buffer a streaming context can be specified. This context is
 * relevant for allocation and during returns. It is assumed that the stream you
 * provide as context will be the stream where you queue the workload. Not doing
 * so may lead to undefined behavior.
 *
 * Buffer return:
 *
 * If no context is provided, we cannot make any assumptions regarding the usage
 * on the device. So we synchronize the device first and then return the buffer
 * to the stock. If a streaming context was provided, we queue an event and add
 * the buffer to the `pending` pool. That means the buffer has been
 * relinquished by the CPU, but may still be used by the GPU. If no pending
 * slot is available, we probe the events of the currently pending buffers for
 * completion. Completed pending buffers are returned to the reserve. If so, we
 * queue the buffer in the freed slot. If that was unsuccessful (i.e., all
 * currently pending buffers are still in use by the GPU), we have no choice but
 * to free the buffer using the current stream.
 *
 * In either case, `max_stock` (see `MemoryPoolOptions`) represents the maximum
 * size of the stock. If returning a buffer would lead to the stock exceeding
 * this quantity, the buffer is queued for destruction.
 */
template <class Allocator>
class MemoryPool final {
 public:
  using pool_type = MemoryPool<Allocator>;
  using alloc_type = typename Allocator::type;

  /**
   * Move-only handle for a group of buffers borrowed from a pool.
   *
   * While `pool_` is non-null the destructor (and move assignment) hands all
   * buffers back to the pool through `put_raw`, using the stream that was
   * given at construction. A moved-from or default-constructed workspace is
   * detached (`pool_ == nullptr`) and returns nothing.
   *
   * @tparam Container Storage for the raw buffer pointers (`std::array` for
   *         static workspaces, `std::vector` for dynamic ones).
   */
  template <class Container>
  class Workspace {
   public:
    /// Creates a detached workspace. The container is value-initialized so a
    /// fixed-size container never exposes indeterminate pointers.
    inline Workspace()
        : pool_{nullptr}, buffer_size_{0}, stream_{0}, buffers_{} {}

    /// Attaches to `pool`. Derived classes fill `buffers_` afterwards; until
    /// then every slot of a fixed-size container is `nullptr`.
    inline Workspace(pool_type* pool, cudaStream_t stream)
        : pool_{pool}, buffer_size_{0}, stream_{stream}, buffers_{} {}

    Workspace(const Workspace&) = delete;

    Workspace& operator=(const Workspace&) = delete;

    /// Takes over the buffers of `other` and detaches it.
    inline Workspace(Workspace&& other)
        : pool_{other.pool_},
          buffer_size_{other.buffer_size_},
          stream_{other.stream_},
          buffers_{std::move(other.buffers_)} {
      // Moving a `std::array` merely copies the pointers, so `other` must be
      // detached explicitly. Otherwise its destructor would hand the very
      // same buffers back to the pool a second time.
      other.pool_ = nullptr;
    }

    /// Returns the currently held buffers (if any) and takes over `other`.
    inline Workspace& operator=(Workspace&& other) {
      // Self-move must be a no-op; otherwise we would return our buffers and
      // then detach ourselves while still exposing the returned pointers.
      if (this == &other) {
        return *this;
      }
      if (pool_) {
        pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_);
      }
      pool_ = other.pool_;
      buffer_size_ = other.buffer_size_;
      stream_ = other.stream_;
      buffers_ = std::move(other.buffers_);
      other.pool_ = nullptr;
      return *this;
    }

    /// Hands all buffers back to the pool unless the workspace is detached.
    inline ~Workspace() {
      if (pool_) {
        pool_->put_raw(buffers_.begin(), buffers_.end(), buffer_size_, stream_);
      }
    }

    /// Bounds-checked access; stores buffer `n` cast to `T` into `*ptr`.
    template <class T>
    constexpr void at(const size_t n, T* ptr) const {
      *ptr = at<T>(n);
    }

    /// Bounds-checked access (via the container's `at`); returns buffer `n`
    /// reinterpreted as `T`.
    template <class T>
    constexpr T at(const size_t n) const {
      return reinterpret_cast<T>(buffers_.at(n));
    }

    /// Unchecked access; stores buffer `n` cast to `T` into `*ptr`.
    template <class T>
    constexpr void get(const size_t n, T* ptr) const {
      *ptr = get<T>(n);
    }

    /// Unchecked access; returns buffer `n` reinterpreted as `T`.
    template <class T>
    constexpr T get(const size_t n) const {
      return reinterpret_cast<T>(buffers_[n]);
    }

    /// Unchecked access to the raw pointer of buffer `n`.
    constexpr alloc_type* operator[](const size_t n) const {
      return buffers_[n];
    }

   protected:
    pool_type* pool_;      ///< Owning pool, or nullptr if detached.
    size_t buffer_size_;   ///< Allocation size reported by `get_raw`.
    cudaStream_t stream_;  ///< Stream context used for borrow and return.
    Container buffers_;    ///< Borrowed buffers.
  };

  /**
   * Workspace with a compile-time number of buffers (`N`), stored in a
   * `std::array`. Only the pool can create attached instances.
   */
  template <size_t N>
  class StaticWorkspace final : public Workspace<std::array<alloc_type*, N>> {
   public:
    using base_type = Workspace<std::array<alloc_type*, N>>;

    friend class MemoryPool<Allocator>;

    inline StaticWorkspace() : base_type() {}

    StaticWorkspace(const StaticWorkspace&) = delete;

    StaticWorkspace& operator=(const StaticWorkspace&) = delete;

    inline StaticWorkspace(StaticWorkspace&& other)
        : base_type(std::move(other)) {}

    inline StaticWorkspace& operator=(StaticWorkspace&& other) {
      base_type::operator=(std::move(other));
      return *this;
    }

   private:
    /// Borrows `N` buffers of at least `requested_buffer_size` from `pool`.
    inline StaticWorkspace(pool_type* pool, size_t requested_buffer_size,
                           cudaStream_t stream)
        : base_type(pool, stream) {
      auto& buffers = this->buffers_;
      this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(),
                                         requested_buffer_size, stream);
    }
  };

  /**
   * Workspace with a runtime number of buffers, stored in a `std::vector`.
   * Only the pool can create attached instances.
   */
  class DynamicWorkspace final : public Workspace<std::vector<alloc_type*>> {
   public:
    using base_type = Workspace<std::vector<alloc_type*>>;

    friend class MemoryPool<Allocator>;

    inline DynamicWorkspace() : base_type() {}

    DynamicWorkspace(const DynamicWorkspace&) = delete;

    DynamicWorkspace& operator=(const DynamicWorkspace&) = delete;

    inline DynamicWorkspace(DynamicWorkspace&& other)
        : base_type(std::move(other)) {}

    inline DynamicWorkspace& operator=(DynamicWorkspace&& other) {
      base_type::operator=(std::move(other));
      return *this;
    }

   private:
    /// Borrows `n` buffers of at least `requested_buffer_size` from `pool`.
    inline DynamicWorkspace(pool_type* pool, size_t n,
                            size_t requested_buffer_size, cudaStream_t stream)
        : base_type(pool, stream) {
      auto& buffers = this->buffers_;
      buffers.resize(n);
      this->buffer_size_ = pool->get_raw(buffers.begin(), buffers.end(),
                                         requested_buffer_size, stream);
    }
  };

  /**
   * Creates an empty pool.
   *
   * @param options Limits for the stock (`max_stock`) and for the number of
   *        buffers that may await GPU completion (`max_pending`).
   * @param allocator Allocator handle forwarded to `Allocator::alloc` and
   *        `Allocator::free`.
   */
  MemoryPool(const MemoryPoolOptions& options, BaseAllocator* allocator)
      : options_{options}, allocator_{allocator} {
    // Create initial buffer stock.
    stock_.reserve(options_.max_stock);

    // Create enough events, so we have one per potentially pending buffer.
    ready_events_.resize(options_.max_pending);
    for (auto& ready_event : ready_events_) {
      CUDA_CHECK(cudaEventCreate(&ready_event));
    }

    // Preallocate pending.
    pending_.reserve(options_.max_pending);
  }

  ~MemoryPool() {
    // Make sure all queued tasks are complete.
    await_pending();

    // Free event and buffer memory.
    for (auto& ready_event : ready_events_) {
      CUDA_CHECK(cudaEventDestroy(ready_event));
    }

    // Any remaining buffers need to be properly unallocated.
    deplete_stock();
  }

  /// Current size of the buffers managed by the pool.
  inline size_t buffer_size() const { return buffer_size_; }

  /// Number of items of `max_item_size` bytes that fit into one buffer.
  /// `max_item_size` must not be zero.
  inline size_t max_batch_size(size_t max_item_size) const {
    return buffer_size_ / max_item_size;
  }

  /// Number of `T` objects that fit into one buffer.
  template <class T>
  inline size_t max_batch_size() const {
    return max_batch_size(sizeof(T));
  }

  /// Number of buffers that are ready to be borrowed.
  size_t current_stock() const {
    std::lock_guard<std::mutex> lock(mutex_);
    return stock_.size();
  }

  /// Number of returned buffers whose ready event has not been collected yet.
  size_t num_pending() const {
    std::lock_guard<std::mutex> lock(mutex_);
    return pending_.size();
  }

  /// Blocks (polling and yielding, with the pool mutex held) until every
  /// pending buffer was collected.
  void await_pending(cudaStream_t stream = 0) {
    std::lock_guard<std::mutex> lock(mutex_);
    while (!pending_.empty()) {
      collect_pending_unsafe(stream);
      if (pending_.empty()) {
        break;
      }
      std::this_thread::yield();
    }
  }

  /// Frees every buffer that is currently in stock.
  void deplete_stock() {
    std::lock_guard<std::mutex> lock(mutex_);
    for (auto& ptr : stock_) {
      Allocator::free(ptr, allocator_);
    }
    stock_.clear();
  }

  /// Borrows a single buffer. The deleter captures `this`, so the pool must
  /// outlive the returned pointer.
  inline std::unique_ptr<alloc_type, std::function<void(alloc_type*)>>
  get_unique(size_t requested_buffer_size, cudaStream_t stream = 0) {
    alloc_type* ptr;
    const size_t allocation_size =
        get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream);
    return {ptr, [this, allocation_size, stream](alloc_type* p) {
              put_raw(&p, (&p) + 1, allocation_size, stream);
            }};
  }

  /// Borrows a single shared buffer. The deleter captures `this`, so the pool
  /// must outlive every copy of the returned pointer.
  inline std::shared_ptr<alloc_type> get_shared(size_t requested_buffer_size,
                                                cudaStream_t stream = 0) {
    alloc_type* ptr;
    const size_t allocation_size =
        get_raw(&ptr, (&ptr) + 1, requested_buffer_size, stream);
    return {ptr, [this, allocation_size, stream](alloc_type* p) {
              put_raw(&p, (&p) + 1, allocation_size, stream);
            }};
  }

  /// Borrows `N` buffers as a workspace that needs no heap memory itself.
  template <size_t N>
  inline StaticWorkspace<N> get_workspace(size_t requested_buffer_size,
                                          cudaStream_t stream = 0) {
    return {this, requested_buffer_size, stream};
  }

  /// Borrows `n` buffers as a workspace backed by a `std::vector`.
  inline DynamicWorkspace get_workspace(size_t n, size_t requested_buffer_size,
                                        cudaStream_t stream = 0) {
    return {this, n, requested_buffer_size, stream};
  }

  friend std::ostream& operator<< <Allocator>(std::ostream&, const MemoryPool&);

 private:
  /// Moves every pending buffer whose event has completed back into the
  /// stock (or frees it) and recycles its event. Caller must hold `mutex_`.
  inline void collect_pending_unsafe(cudaStream_t stream) {
    auto it{std::remove_if(
        pending_.begin(), pending_.end(), [this, stream](const auto& pending) {
          const cudaError_t state{cudaEventQuery(std::get<2>(pending))};
          switch (state) {
            case cudaSuccess:
              // Stock buffers and destroy those that are no
              // longer needed, but only if the allocation_size
              // is still the same as the current buffer_size.
              if (stock_.size() < options_.max_stock &&
                  std::get<1>(pending) == buffer_size_) {
                stock_.emplace_back(std::get<0>(pending));
              } else {
                Allocator::free(std::get<0>(pending), allocator_, stream);
              }
              ready_events_.emplace_back(std::get<2>(pending));
              return true;
            case cudaErrorNotReady:
              return false;
            default:
              CUDA_CHECK(state);
              return false;
          }
        })};
    pending_.erase(it, pending_.end());
  }

  /// Frees the whole stock on `stream`. Caller must hold `mutex_`.
  inline void clear_stock_unsafe(cudaStream_t stream) {
    for (auto& ptr : stock_) {
      Allocator::free(ptr, allocator_, stream);
    }
    stock_.clear();
  }

  /**
   * Fills `[first, last)` with buffers, preferring the stock over fresh
   * allocations. A request larger than the current buffer size drops the
   * stock and raises the pool's buffer size.
   *
   * @return The allocation size that has to be passed to `put_raw` later.
   */
  template <class Iterator>
  inline size_t get_raw(Iterator first, Iterator const last,
                        size_t requested_buffer_size, cudaStream_t stream) {
    // Get pre-allocated buffers if stock available.
    size_t allocation_size;
    {
      std::lock_guard<std::mutex> lock(mutex_);

      // If requested_buffer_size is within current buffer_size margins can
      // reuse current buffers.
      if (requested_buffer_size <= buffer_size_) {
        while (first != last) {
          // If no buffers available, try to make some available.
          if (stock_.empty()) {
            collect_pending_unsafe(stream);
            if (stock_.empty()) {
              // No buffers available.
              break;
            }
          }

          // Just take the next available buffer.
          *first++ = stock_.back();
          stock_.pop_back();
        }
      } else {
        // Drop the stock because we need more memory and those buffers have
        // become useless to that end.
        clear_stock_unsafe(stream);
        buffer_size_ = requested_buffer_size;
      }

      allocation_size = buffer_size_;
    }

    // Forge new buffers until request can be filled.
    for (; first != last; ++first) {
      *first = Allocator::alloc(allocation_size, allocator_, stream);
    }

    return allocation_size;
  }

  /**
   * Takes back the buffers in `[first, last)`; null entries are ignored.
   *
   * @param allocation_size Size that `get_raw` reported for these buffers.
   * @param stream Stream on which the buffers were used.
   */
  template <class Iterator>
  inline void put_raw(Iterator first, Iterator const last,
                      size_t allocation_size, cudaStream_t stream) {
    std::lock_guard<std::mutex> lock(mutex_);

    // If allocation_size of the workspace differs from the current buffer_size
    // (i.e., somebody else requested a larger buffer since the original request
    // occurred), the provided buffers are incompatible and have to be
    // discarded. Slots that never received a buffer are null and are skipped.
    if (allocation_size != buffer_size_) {
      for (; first != last; ++first) {
        if (*first != nullptr) {
          Allocator::free(*first, allocator_);
        }
      }
      return;
    }

    // If the workspace that borrowed a stream was moved out of the RAII scope
    // where it was created, it could happen that the stream was destroyed when
    // we return the buffer ownership. This will prevent that.
    //
    // Note that `cudaStreamQuery` isn't designed to track stream destruction.
    // This check is a last resort, and may not work reliably. The recommended
    // best practice is to simply ensure streams you use are alive and well.
    if (cudaStreamQuery(stream) != cudaErrorInvalidResourceHandle) {
      bool stream_synchronized{false};

      for (; first != last; ++first) {
        // Avoid adding already deallocated buffers.
        if (*first == nullptr) {
          continue;
        }

        // Spin lock if too many pending buffers (i.e., let CPU wait for GPU).
        // Waiting only makes sense while something is pending; otherwise no
        // event can ever become available.
        while (ready_events_.empty() && !pending_.empty()) {
          collect_pending_unsafe(stream);
          if (!ready_events_.empty()) {
            break;
          }
          std::this_thread::yield();
        }

        if (ready_events_.empty()) {
          // The pool owns no events at all (`max_pending == 0`). Spinning
          // would hang forever with the mutex held, so wait for the stream
          // instead and hand the buffer back directly.
          if (!stream_synchronized) {
            CUDA_CHECK(cudaStreamSynchronize(stream));
            stream_synchronized = true;
          }
          if (stock_.size() < options_.max_stock) {
            stock_.emplace_back(*first);
          } else {
            Allocator::free(*first, allocator_);
          }
          continue;
        }

        // Queue buffer.
        cudaEvent_t ready_event{ready_events_.back()};
        ready_events_.pop_back();
        CUDA_CHECK(cudaEventRecord(ready_event, stream));
        pending_.emplace_back(*first, allocation_size, ready_event);
      }
    } else {
      // Without stream context, we must force a hard sync with the GPU.
      CUDA_CHECK(cudaDeviceSynchronize());

      for (; first != last; ++first) {
        // Avoid adding already deallocated buffers.
        if (*first == nullptr) {
          continue;
        }

        // Stock buffers and destroy those that are no longer needed.
        if (stock_.size() < options_.max_stock) {
          stock_.emplace_back(*first);
        } else {
          Allocator::free(*first, allocator_);
        }
      }
    }
  }

  const MemoryPoolOptions options_;

  mutable std::mutex mutex_;               ///< Guards all members below.
  size_t buffer_size_{1};                  ///< Size of every pooled buffer.
  std::vector<alloc_type*> stock_;         ///< Buffers ready to be borrowed.
  std::vector<cudaEvent_t> ready_events_;  ///< Events not bound to a buffer.

  /// Returned buffers (pointer, allocation size, event) that may still be in
  /// use by the GPU.
  std::vector<std::tuple<alloc_type*, size_t, cudaEvent_t>> pending_;
  BaseAllocator* allocator_;
};

/**
 * Writes a human readable snapshot of `pool` to `os`: the stock, the pending
 * buffers and the idle events, framed by two 80 character wide rules. The
 * pool mutex is held for the duration of the dump.
 */
template <class Allocator>
std::ostream& operator<<(std::ostream& os, const MemoryPool<Allocator>& pool) {
  std::lock_guard<std::mutex> lock(pool.mutex_);

  // Emits the horizontal rule one dash at a time.
  const auto write_rule = [&os]() {
    for (size_t col{0}; col != 80; ++col) {
      os << '-';
    }
  };

  write_rule();

  // Current stock.
  os << "\nStock =\n";
  {
    size_t row{0};
    for (const auto& buffer : pool.stock_) {
      os << "[ " << row << " ] buffer " << static_cast<void*>(buffer)
         << ", size = " << pool.buffer_size_ << '\n';
      ++row;
    }
  }

  // Pending buffers.
  os << "\nPending =\n";
  {
    size_t row{0};
    for (const auto& entry : pool.pending_) {
      os << "[ " << row
         << " ] buffer = " << static_cast<void*>(std::get<0>(entry))
         << ", size = " << std::get<1>(entry)
         << ", ready_event = " << static_cast<void*>(std::get<2>(entry))
         << '\n';
      ++row;
    }
  }

  // Available ready events.
  os << "\nReady Events =\n";
  {
    size_t row{0};
    for (const auto& event : pool.ready_events_) {
      os << "[ " << row << " ] " << static_cast<void*>(event) << '\n';
      ++row;
    }
  }

  write_rule();

  os << '\n';
  return os;
}

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/multi_vector.hpp
================================================
/*
 * Copyright (c) 2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <array>
#include <cstddef>
#include <cstdint>
#include <initializer_list>
#include <tuple>
#include <type_traits>

namespace nv {
namespace merlin {

/*
MultiVector supports:

1.Different types (any T1, T2, ...)

2.Each block of memory is 16-byte aligned

3.The first address of the i-th element can be retrieved using get<i>() (a
pointer of the correct type)

4.The total size of the entire multivector can be obtained

5.Large blocks of memory are allocated at once, with manual internal
partitioning (to improve memory locality)
*/
/**
 * Describes how several arrays of different element types (`Ts...`) are laid
 * out back to back inside one byte buffer. The class stores no data itself; it
 * only computes the offsets and hands out typed pointers into a buffer that
 * the caller provides.
 */
template <typename... Ts>
class MultiVector {
 public:
  /// Every sub-array starts at an offset that is a multiple of this value.
  static constexpr size_t Alignment = 16;

  /**
   * @param lens Number of elements of each sub-array, one per type in `Ts`.
   */
  template <typename... Lens, typename = typename std::enable_if<
                                  sizeof...(Lens) == sizeof...(Ts)>::type>
  explicit MultiVector(Lens... lens)
      : lengths_{{static_cast<size_t>(lens)...}} {
    compute_offsets();
  }

  ~MultiVector() {}

  /// Returns the start of the `I`-th sub-array inside `data`, typed as a
  /// pointer to the `I`-th element type.
  template <size_t I>
  auto get(uint8_t* data) {
    using element_type = typename std::tuple_element<I, std::tuple<Ts...>>::type;
    uint8_t* const begin = data + offsets_[I];
    return reinterpret_cast<element_type*>(begin);
  }

  /// Number of elements of sub-array `idx`.
  size_t length(size_t idx) const { return lengths_[idx]; }

  /// Byte offset of sub-array `idx` from the start of the buffer.
  size_t offset(size_t idx) const { return offsets_[idx]; }

  /// Number of bytes the buffer must provide (rounded up to `Alignment`).
  size_t total_size() const { return total_size_; }

 private:
  std::array<size_t, sizeof...(Ts)> lengths_{};
  std::array<size_t, sizeof...(Ts)> offsets_{};
  size_t total_size_{0};

  /// Rounds `n` up to the next multiple of `alignment`.
  constexpr size_t align_up(size_t n, size_t alignment) {
    return (n + alignment - 1) / alignment * alignment;
  }

  /// Walks the sub-arrays in declaration order and assigns each one the next
  /// aligned offset.
  void compute_offsets() {
    const size_t element_sizes[] = {sizeof(Ts)...};

    size_t cursor = 0;
    for (size_t i = 0; i < sizeof...(Ts); ++i) {
      cursor = align_up(cursor, Alignment);
      offsets_[i] = cursor;
      cursor += lengths_[i] * element_sizes[i];
    }

    total_size_ = align_up(cursor, Alignment);
  }
};

/// Free-function form of `MultiVector::get<I>`: returns the typed start of
/// the `I`-th sub-array of `mv` inside `data`.
template <size_t I, typename... Ts>
auto get_vector(MultiVector<Ts...>& mv, uint8_t* data) {
  auto* const sub_array = mv.template get<I>(data);
  return sub_array;
}

}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/optimizers.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cuda_runtime.h>
#include "types.cuh"
#include "utils.cuh"

namespace nv {
namespace merlin {
namespace optimizers {

/**
 * Adam update, one thread per element; threads with an index >= `len` do
 * nothing.
 *
 * @param len Number of elements.
 * @param weight Weights, updated in place.
 * @param m First moment estimates, updated in place.
 * @param v Second moment estimates, updated in place.
 * @param wgrad Gradients; each one is divided by `scaler` before use.
 * @param alpha_t Step size applied to the update.
 * @param beta1 Decay factor of the first moment.
 * @param beta2 Decay factor of the second moment.
 * @param epsilon Added to the square root in the denominator.
 * @param scaler Divisor applied to the incoming gradients.
 */
template <class T>
__global__ void adam_update_kernel(int len, float* weight, T* m, T* v,
                                   const T* wgrad, float alpha_t, float beta1,
                                   float beta2, float epsilon, float scaler) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= len) {
    return;
  }

  const float grad = TypeConvertFunc<float, T>::convert(wgrad[tid]) / scaler;
  const float first_moment =
      beta1 * TypeConvertFunc<float, T>::convert(m[tid]) + (1.f - beta1) * grad;
  const float second_moment =
      beta2 * TypeConvertFunc<float, T>::convert(v[tid]) +
      (1.f - beta2) * grad * grad;

  m[tid] = TypeConvertFunc<T, float>::convert(first_moment);
  v[tid] = TypeConvertFunc<T, float>::convert(second_moment);
  weight[tid] -= alpha_t * first_moment / (sqrt(second_moment) + epsilon);
}

/**
 * AdaGrad update, one thread per element; threads with an index >= `len` do
 * nothing.
 *
 * @param len Number of elements.
 * @param weight Weights, updated in place.
 * @param wgrad Gradients; each one is divided by `scaler` before use.
 * @param sum Accumulated squared gradients, updated in place.
 * @param lr Learning rate.
 * @param epsilon Added to the square root in the denominator.
 * @param scaler Divisor applied to the incoming gradients.
 */
template <class T>
__global__ void ada_grad_update_kernel(int len, float* weight, const T* wgrad,
                                       T* sum, float lr, const float epsilon,
                                       float scaler) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    float gi = TypeConvertFunc<float, T>::convert(wgrad[i]) / scaler;
    // `sum` is written further down by this very kernel, so it must be read
    // with an ordinary load. The read-only data cache (`__ldg`) is reserved
    // for data that stays unmodified for the whole lifetime of the kernel.
    float accum_ = TypeConvertFunc<float, T>::convert(sum[i]);
    accum_ += gi * gi;
    float std_ = epsilon + sqrtf(accum_);
    weight[i] -= lr * gi / std_;
    sum[i] = TypeConvertFunc<T, float>::convert(accum_);
  }
}

/**
 * SGD with momentum, one thread per element; threads with an index >= `len`
 * do nothing.
 *
 * @param len Number of elements.
 * @param weight Weights, updated in place.
 * @param momentum Momentum buffer, updated in place.
 * @param wgrad Gradients; each one is divided by `scaler` before use.
 * @param lr Learning rate.
 * @param momentum_factor Factor applied to the previous momentum.
 * @param scaler Divisor applied to the incoming gradients.
 */
template <class T>
__global__ void momentum_sgd_update_kernel(int len, float* weight, T* momentum,
                                           const T* wgrad, float lr,
                                           float momentum_factor,
                                           float scaler) {
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= len) {
    return;
  }

  const float updated_momentum =
      momentum_factor * TypeConvertFunc<float, T>::convert(momentum[tid]) -
      lr * TypeConvertFunc<float, T>::convert(wgrad[tid]) / scaler;
  momentum[tid] = TypeConvertFunc<T, float>::convert(updated_momentum);
  weight[tid] += updated_momentum;
}

}  // namespace optimizers
}  // namespace merlin
}  // namespace nv

================================================
FILE: include/merlin/types.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <stddef.h>
#include <cstdint>
#include <cuda/atomic>
#include <cuda/std/semaphore>
#include "debug.hpp"

namespace nv {
namespace merlin {

/**
 * Shorthand for a Key-Value-score tuple.
 *
 * @tparam K Type of the key.
 * @tparam V Element type the `value` pointer refers to.
 * @tparam S Type of the score.
 */
template <class K, class V, class S>
struct KVM {
  K key;     ///< The key.
  V* value;  ///< Pointer to the value; the struct itself stores no values.
  S score;   ///< The score associated with the key.
};

// Storage size.
// Aliases named after the width of the underlying type in bytes.
using byte16 = uint4;
using byte8 = uint2;
using byte4 = uint32_t;
using byte2 = uint16_t;
using byte = uint8_t;

// Digest.
using D = byte;

// Default values of the reserved keys. The constants with the same names
// (without the `DEFAULT_` prefix) further down start out with these values.
constexpr uint64_t DEFAULT_EMPTY_KEY = UINT64_C(0xFFFFFFFFFFFFFFFF);
constexpr uint64_t DEFAULT_RECLAIM_KEY = UINT64_C(0xFFFFFFFFFFFFFFFE);
constexpr uint64_t DEFAULT_LOCKED_KEY = UINT64_C(0xFFFFFFFFFFFFFFFD);

// Default masks used by IS_RESERVED_KEY() and IS_VACANT_KEY().
constexpr uint64_t DEFAULT_RESERVED_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFC);
constexpr uint64_t DEFAULT_VACANT_KEY_MASK = UINT64_C(0xFFFFFFFFFFFFFFFE);

constexpr uint64_t MAX_SCORE = UINT64_C(0xFFFFFFFFFFFFFFFF);
constexpr uint64_t EMPTY_SCORE = UINT64_C(0);
constexpr uint64_t IGNORED_GLOBAL_EPOCH = UINT64_C(0xFFFFFFFFFFFFFFFF);

// Host-side copy of EMPTY_KEY; init_reserved_keys() keeps both in sync.
static uint64_t EMPTY_KEY_CPU = DEFAULT_EMPTY_KEY;
// Device-side reserved keys; init_reserved_keys() may overwrite them.
__constant__ uint64_t EMPTY_KEY = DEFAULT_EMPTY_KEY;
__constant__ uint64_t RECLAIM_KEY = DEFAULT_RECLAIM_KEY;
__constant__ uint64_t LOCKED_KEY = DEFAULT_LOCKED_KEY;

// Device-side masks: a key is reserved if `(key & MASK_1) == MASK_2`, and
// likewise for vacant keys. init_reserved_keys() may overwrite them.
__constant__ uint64_t RESERVED_KEY_MASK_1 = DEFAULT_RESERVED_KEY_MASK;
__constant__ uint64_t RESERVED_KEY_MASK_2 = DEFAULT_RESERVED_KEY_MASK;
__constant__ uint64_t VACANT_KEY_MASK_1 = DEFAULT_VACANT_KEY_MASK;
__constant__ uint64_t VACANT_KEY_MASK_2 = DEFAULT_VACANT_KEY_MASK;

// Largest `index` that init_reserved_keys() accepts.
constexpr int MAX_RESERVED_KEY_BIT = 62;

/// Returns true if `key`, masked with RESERVED_KEY_MASK_1, equals
/// RESERVED_KEY_MASK_2, i.e. if it is one of the reserved keys.
template <class K>
__forceinline__ __device__ bool IS_RESERVED_KEY(K key) {
  const auto masked_key = RESERVED_KEY_MASK_1 & key;
  return masked_key == RESERVED_KEY_MASK_2;
}

/// Returns true if `key`, masked with VACANT_KEY_MASK_1, equals
/// VACANT_KEY_MASK_2, i.e. if it marks a vacant slot.
template <class K>
__forceinline__ __device__ bool IS_VACANT_KEY(K key) {
  const auto masked_key = VACANT_KEY_MASK_1 & key;
  return masked_key == VACANT_KEY_MASK_2;
}

/**
 * Relocates the two bits that distinguish the reserved keys from bits 0/1 to
 * bits `index`/`index + 1` and uploads the resulting keys and masks to the
 * device constants. `EMPTY_KEY_CPU` is updated as well.
 *
 * @param index Bit position in [1, MAX_RESERVED_KEY_BIT]. Any other value
 *        leaves the current configuration untouched.
 * @return `cudaSuccess` if nothing had to be done, otherwise the result of
 *         `cudaGetLastError()` after the uploads.
 */
static cudaError_t init_reserved_keys(int index) {
  // index = 0 is the default,
  // index = 62 is the maximum index can be set for reserved keys.
  const bool relocatable = (index >= 1) && (index <= MAX_RESERVED_KEY_BIT);
  if (!relocatable) {
    return cudaSuccess;
  }

  const uint64_t both_bits = UINT64_C(3) << index;
  const uint64_t lower_bit = UINT64_C(1) << index;
  const uint64_t upper_bit = UINT64_C(2) << index;
  const uint64_t without_bit0 = ~UINT64_C(1);

  const uint64_t reserved_mask_1 = ~both_bits;
  const uint64_t reserved_mask_2 = reserved_mask_1 & without_bit0;
  const uint64_t vacant_mask_1 = ~lower_bit;
  const uint64_t vacant_mask_2 = vacant_mask_1 & without_bit0;

  const uint64_t empty_key = reserved_mask_2 | both_bits;
  const uint64_t reclaim_key = vacant_mask_2;
  const uint64_t locked_key = empty_key & ~upper_bit;
  EMPTY_KEY_CPU = empty_key;

  // Reserved keys.
  CUDA_CHECK(cudaMemcpyToSymbol(EMPTY_KEY, &empty_key, sizeof(uint64_t)));
  CUDA_CHECK(cudaMemcpyToSymbol(RECLAIM_KEY, &reclaim_key, sizeof(uint64_t)));
  CUDA_CHECK(cudaMemcpyToSymbol(LOCKED_KEY, &locked_key, sizeof(uint64_t)));

  // Masks used by IS_RESERVED_KEY() and IS_VACANT_KEY().
  CUDA_CHECK(cudaMemcpyToSymbol(RESERVED_KEY_MASK_1, &reserved_mask_1,
                                sizeof(uint64_t)));
  CUDA_CHECK(cudaMemcpyToSymbol(RESERVED_KEY_MASK_2, &reserved_mask_2,
                                sizeof(uint64_t)));
  CUDA_CHECK(
      cudaMemcpyToSymbol(VACANT_KEY_MASK_1, &vacant_mask_1, sizeof(uint64_t)));
  CUDA_CHECK(
      cudaMemcpyToSymbol(VACANT_KEY_MASK_2, &vacant_mask_2, sizeof(uint64_t)));
  return cudaGetLastError();
}

/// Device-scope atomic wrapper used for the keys stored in a bucket.
template <class K>
using AtomicKey = cuda::atomic<K, cuda::thread_scope_device>;

/// Device-scope atomic wrapper used for the scores stored in a bucket.
template <class S>
using AtomicScore = cuda::atomic<S, cuda::thread_scope_device>;

/// Device-scope atomic wrapper for positions.
/// NOTE(review): no use of this alias is visible in this part of the file.
template <class T>
using AtomicPos = cuda::atomic<T, cuda::thread_scope_device>;

/**
 * Handles of the arrays that make up one bucket: keys, scores, digests and
 * value vectors. The static helpers derive the digest and score locations
 * from a raw key pointer instead of the stored handles.
 */
template <class K, class V, class S>
struct Bucket {
  AtomicKey<K>* keys_;
  /// TODO: compute the pointer of scores and digests using bucket_max_size
  AtomicScore<S>* scores_;
  /// @brief not visible to users
  D* digests_;
  V* vectors;  // Pinned memory or HBM

  /// Address of the digest at position `index`.
  __forceinline__ __device__ D* digests(int index) const {
    return &digests_[index];
  }

  /// Address of the key at position `index`.
  __forceinline__ __device__ AtomicKey<K>* keys(int index) const {
    return &keys_[index];
  }

  /// Address of the score at position `index`.
  __forceinline__ __device__ AtomicScore<S>* scores(int index) const {
    return &scores_[index];
  }

  /// Address of the `keys_` handle itself, viewed as a raw `K*` slot.
  __forceinline__ __device__ K** keys_addr() {
    K** const handle = reinterpret_cast<K**>(&keys_);
    return handle;
  }

  /// Views `keys` as atomics and returns the address of element `offset`.
  static __forceinline__ __device__ AtomicKey<K>* keys(K* keys,
                                                       uint32_t offset) {
    AtomicKey<K>* const atomic_keys = reinterpret_cast<AtomicKey<K>*>(keys);
    return atomic_keys + offset;
  }

  /// Address of digest `offset`. The digests sit directly in front of the
  /// keys and take up max(bucket_capacity, 128) bytes.
  static __forceinline__ __device__ D* digests(K* keys,
                                               uint32_t bucket_capacity,
                                               uint32_t offset) {
    const uint32_t digest_bytes = umax(bucket_capacity, 128);
    D* const first_digest = reinterpret_cast<D*>(keys) - digest_bytes;
    return first_digest + offset;
  }

  /// Address of score `offset`. The scores follow directly after the
  /// `bucket_capacity` keys.
  static __forceinline__ __device__ S* scores(K* keys, uint32_t bucket_capacity,
                                              uint32_t offset) {
    S* const first_score = reinterpret_cast<S*>(keys + bucket_capacity);
    return first_score + offset;
  }
};

/**
 * Spin lock for cooperative groups. The lock word holds 1 while the lock is
 * free and 2 while it is held. A single thread of the group (`lane`) operates
 * on the lock word; the group is synchronized around that.
 */
template <cuda::thread_scope Scope, class T = int>
class Lock {
  mutable cuda::atomic<T, Scope> _lock;

 public:
  /// Starts out unlocked.
  __device__ Lock() : _lock{1} {}

  /// Thread `lane` spins until it swaps the lock word from 1 to 2, then the
  /// whole group synchronizes.
  template <typename CG>
  __forceinline__ __device__ void acquire(CG const& g,
                                          unsigned long long lane = 0) const {
    if (g.thread_rank() == lane) {
      for (;;) {
        // A failed exchange overwrites `expected`, so start afresh each time.
        T expected = 1;
        if (_lock.compare_exchange_weak(expected, 2,
                                        cuda::std::memory_order_acquire)) {
          break;
        }
      }
    }
    g.sync();
  }

  /// Synchronizes the group first, then thread `lane` stores 1 (unlocked).
  template <typename CG>
  __forceinline__ __device__ void release(CG const& g,
                                          unsigned long long lane = 0) const {
    g.sync();
    if (g.thread_rank() != lane) {
      return;
    }
    _lock.store(1, cuda::std::memory_order_release);
  }
};

/// Device-scope spin lock with the default (`int`) lock word.
using Mutex = Lock<cuda::thread_scope_device>;

/**
 * Plain descriptor of a hash table: handles of its buckets, locks and value
 * slices together with the sizing parameters. Several members are marked as
 * unused by their own comments.
 */
template <class K, class V, class S>
struct Table {
  Bucket<K, V, S>* buckets;
  Mutex* locks;                 // mutex for write buckets
  int* buckets_size;            // size of each buckets.
  V** slices;                   // Handles of the HBM/ HMEM slices.
  size_t dim;                   // Dimension of the `vectors`.
  size_t bytes_per_slice;       // Size by byte of one slice.
  size_t num_of_memory_slices;  // Number of vectors memory slices.
  size_t capacity = 134217728;  // Initial capacity.
  size_t max_size =
      std::numeric_limits<uint64_t>::max();  // Up limit of the table capacity.
  size_t buckets_num;                        // Number of the buckets.
  size_t bucket_max_size = 128;              // Volume of each buckets.
  size_t max_hbm_for_vectors = 0;            // Max HBM allocated for vectors
  size_t remaining_hbm_for_vectors = 0;  // Remaining HBM allocated for vectors
  size_t num_of_buckets_per_alloc = 1;   // Number of buckets allocated in each
                                         // HBM allocation, must be power of 2.
  bool is_pure_hbm = true;               // unused
  bool primary = true;                   // unused
  bool dual_bucket_mode = false;         // Enable dual-bucket addressing
  int slots_offset = 0;                  // unused
  int slots_number = 0;                  // unused
  int device_id = 0;                     // Device id
  int tile_size;  // NOTE(review): no default and no description; confirm
                  // meaning and initialization with the code that fills it.
};

/**
 * Signature of a predicate function that is evaluated with a key/score pair
 * of the table and a pattern/threshold pair supplied by the caller.
 * NOTE(review): judging by the name, returning true selects the entry for
 * erasure — confirm against the kernels that call it.
 */
template <class K, class S>
using EraseIfPredictInternal =
    bool (*)(const K& key,       ///< iterated key in table
             S& score,           ///< iterated score in table
             const K& pattern,   ///< input key from caller
             const S& threshold  ///< input score from caller
    );

/**
 * An abstract class that provides the interface between the
 * nv::merlin::HashTable and a file, which enables the table to save to the
 * file or load from the file, by overriding the `read` and `write` methods.
 *
 * @tparam K The data type of the key.
 * @tparam V The data type of the vector's elements.
 *         The item data type should be a basic data type of C++/CUDA.
 * @tparam S The data type for `score`.
 *           The currently supported data type is only `uint64_t`.
 *
 */
template <class K, class V, class S>
class BaseKVFile {
 public:
  virtual ~BaseKVFile() {}

  /**
   * Read from file and fill into the keys, values, and scores buffer.
   * When calling save/load method from table, it can assume that the
   * received buffer of keys, vectors, and scores are automatically
   * pre-allocated.
   *
   * @param n The number of KV pairs expected to be read.
   *          NOTE(review): earlier documentation mentioned `int64_t` (to
   *          adapt to various filesystems and formats), but the parameter is
   *          declared as `size_t`.
   * @param dim The dimension of the `vectors`.
   * @param keys The pointer to received buffer for keys.
   * @param vectors The pointer to received buffer for vectors.
   * @param scores The pointer to received buffer for scores.
   *
   * @return Number of KV pairs that have been successfully read.
   */
  virtual size_t read(const size_t n, const size_t dim, K* keys, V* vectors,
                      S* scores) = 0;

  /**
   * Write keys, values, scores from table to the file. It defines
   * an abstract method to get a batch of KV pairs and write them into
   * the file.
   *
   * @param n The number of KV pairs to be written.
   *          NOTE(review): earlier documentation mentioned `int64_t` (to
   *          adapt to various filesystems and formats), but the parameter is
   *          declared as `size_t`.
   * @param dim The dimension of the `vectors`.
   * @param keys The keys that will be written to file.
   * @param vectors The vectors of values that will be written to file.
   * @param scores The scores that will be written to file.
   *
   * @return Number of KV pairs that have been successfully written.
   */
  virtual size_t write(const size_t n, const size_t dim, const K* keys,
                       const V* vectors, const S* scores) = 0;
};

/// Outcome of an attempt to occupy a slot for a key during an insert.
enum class OccupyResult {
  INITIAL,         ///< Initial status
  CONTINUE,        ///< Insert did not succeed, continue trying to insert
  OCCUPIED_EMPTY,  ///< New pair inserted successfully
  OCCUPIED_RECLAIMED,  ///< NOTE(review): presumably inserted into a reclaimed
                       ///< slot — confirm against the insert kernels.
  DUPLICATE,  ///< Insert did not succeed, key is already present
  EVICT,      ///< Insert succeeded by evicting one key with minimum score.
  REFUSED,    ///< Insert did not succeed, insert score is too low.
  ILLEGAL,    ///< Illegal state, and don't need to do anything.
};

/// Outcome of an attempt to override an existing entry.
enum class OverrideResult {
  INITIAL,   ///< Initial status
  CONTINUE,  ///< Override did not succeed, continue trying to override
  SUCCESS,   ///< Override successfully
  REFUSED,   ///< Override is refused.
};

// Tag types, one per supported GPU architecture. Each one carries the compute
// capability as major * 10 + minor in `kComputeCapability`.
struct Sm70 {
  static int const kComputeCapability = 70;
};
struct Sm72 {
  static int const kComputeCapability = 72;
};
struct Sm75 {
  static int const kComputeCapability = 75;
};
struct Sm80 {
  static int const kComputeCapability = 80;
};
struct Sm86 {
  static int const kComputeCapability = 86;
};

struct Sm90 {
  static int const kComputeCapability = 90;
};

/* This struct is mainly for keeping the code readable, it should be strictly
 * consistent with `EvictStrategy::EvictStrategyEnum`.
 *
 * The values are plain `int` constants, so they can be used wherever an
 * integral constant expression is required.
 */
struct EvictStrategyInternal {
  constexpr static int kLru = 0;         ///< LRU mode.
  constexpr static int kLfu = 1;         ///< LFU mode.
  constexpr static int kEpochLru = 2;    ///< Epoch + LRU mode.
  constexpr static int kEpochLfu = 3;    ///< Epoch + LFU mode.
  constexpr static int kCustomized = 4;  ///< Customized mode.
};

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin/utils.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cooperative_groups.h>
#include <stdarg.h>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <string>
#include "cuda_fp16.h"
#include "debug.hpp"

// NOTE(review): this using-directive sits in a header and therefore affects
// every file that includes it; code elsewhere may rely on the unqualified
// names, so it is left as is.
using namespace cooperative_groups;
// Short alias for explicit qualification.
namespace cg = cooperative_groups;

/**
 * @brief `atomicCAS` overload for `uint64_t` operands.
 *
 * Reinterprets @p address as `unsigned long long*` and forwards to the
 * `unsigned long long` overload of `atomicCAS`.
 *
 * @param address Location to operate on.
 * @param compare Value passed as the comparand.
 * @param val Value passed as the replacement.
 * @return The result of the forwarded call, converted back to `uint64_t`.
 */
__inline__ __device__ uint64_t atomicCAS(uint64_t* address, uint64_t compare,
                                         uint64_t val) {
  auto* const target = reinterpret_cast<unsigned long long*>(address);
  const unsigned long long previous =
      atomicCAS(target, static_cast<unsigned long long>(compare),
                static_cast<unsigned long long>(val));
  return static_cast<uint64_t>(previous);
}

/**
 * @brief `atomicCAS` overload for `int64_t` operands.
 *
 * Reinterprets @p address as `unsigned long long*` and forwards to the
 * `unsigned long long` overload of `atomicCAS`; the operands are converted to
 * `unsigned long long` for the call.
 *
 * @param address Location to operate on.
 * @param compare Value passed as the comparand.
 * @param val Value passed as the replacement.
 * @return The result of the forwarded call, converted back to `int64_t`.
 */
__inline__ __device__ int64_t atomicCAS(int64_t* address, int64_t compare,
                                        int64_t val) {
  auto* const target = reinterpret_cast<unsigned long long*>(address);
  const unsigned long long previous =
      atomicCAS(target, static_cast<unsigned long long>(compare),
                static_cast<unsigned long long>(val));
  return static_cast<int64_t>(previous);
}

/**
 * @brief `atomicExch` overload for `uint64_t` operands.
 *
 * Forwards to the `unsigned long long` overload of `atomicExch`.
 *
 * @param address Location to operate on.
 * @param val Value passed as the new contents.
 * @return The result of the forwarded call, converted back to `uint64_t`.
 */
__inline__ __device__ uint64_t atomicExch(uint64_t* address, uint64_t val) {
  auto* const target = reinterpret_cast<unsigned long long*>(address);
  const unsigned long long previous =
      atomicExch(target, static_cast<unsigned long long>(val));
  return static_cast<uint64_t>(previous);
}

/**
 * @brief `atomicExch` overload for `int64_t` operands.
 *
 * Forwards to the `unsigned long long` overload of `atomicExch`.
 *
 * @param address Location to operate on.
 * @param val Value passed as the new contents.
 * @return The result of the forwarded call, converted back to `int64_t`.
 */
__inline__ __device__ int64_t atomicExch(int64_t* address, int64_t val) {
  auto* const target = reinterpret_cast<unsigned long long*>(address);
  const unsigned long long previous =
      atomicExch(target, static_cast<unsigned long long>(val));
  return static_cast<int64_t>(previous);
}

/**
 * @brief Atomically exchanges a single `signed char`.
 *
 * The previous implementation performed a plain load followed by a plain
 * store, so two threads exchanging the same byte could both observe the same
 * old value. This version runs a compare-and-swap loop on the naturally
 * aligned 32-bit word that contains the byte, replacing only the addressed
 * byte and leaving its three neighbours untouched.
 *
 * @param address Location of the byte to exchange.
 * @param val New value to store.
 * @return The byte that was stored at @p address immediately before the
 * exchange took effect.
 */
__inline__ __device__ signed char atomicExch(signed char* address,
                                             signed char val) {
  const uintptr_t raw = reinterpret_cast<uintptr_t>(address);
  // 32-bit word containing the target byte.
  unsigned int* const word =
      reinterpret_cast<unsigned int*>(raw & ~static_cast<uintptr_t>(3));
  // Bit offset of the target byte inside that word (little-endian layout).
  const unsigned int shift = static_cast<unsigned int>(raw & 3) * 8u;
  const unsigned int mask = 0xFFu << shift;
  const unsigned int insert =
      static_cast<unsigned int>(static_cast<unsigned char>(val)) << shift;

  unsigned int observed = *word;
  unsigned int assumed;
  do {
    assumed = observed;
    // Retry until the word was not modified between the read and the swap.
    observed = atomicCAS(word, assumed, (assumed & ~mask) | insert);
  } while (observed != assumed);

  return static_cast<signed char>(
      static_cast<unsigned char>((observed & mask) >> shift));
}

/**
 * @brief `atomicAdd` overload for `int64_t` operands.
 *
 * Forwards to the `unsigned long long` overload of `atomicAdd`; @p val is
 * converted to `unsigned long long` for the call.
 *
 * @param address Location to operate on.
 * @param val Addend.
 * @return The result of the forwarded call, converted back to `int64_t`.
 */
__inline__ __device__ int64_t atomicAdd(int64_t* address, const int64_t val) {
  auto* const target = reinterpret_cast<unsigned long long*>(address);
  const unsigned long long previous =
      atomicAdd(target, static_cast<unsigned long long>(val));
  return static_cast<int64_t>(previous);
}

/**
 * @brief `atomicAdd` overload for `uint64_t` operands.
 *
 * Forwards to the `unsigned long long` overload of `atomicAdd`.
 *
 * @param address Location to operate on.
 * @param val Addend.
 * @return The result of the forwarded call, converted back to `uint64_t`.
 */
__inline__ __device__ uint64_t atomicAdd(uint64_t* address,
                                         const uint64_t val) {
  auto* const target = reinterpret_cast<unsigned long long*>(address);
  const unsigned long long previous =
      atomicAdd(target, static_cast<unsigned long long>(val));
  return static_cast<uint64_t>(previous);
}

namespace nv {
namespace merlin {

/**
 * @brief Reads the PTX `%globaltimer` special register and returns it as `S`.
 *
 * @tparam S Type of the returned value. The `"=l"` output constraint binds a
 * 64-bit register, so `S` is presumably expected to be a 64-bit integer type
 * - TODO confirm against the instantiations.
 *
 * @return The value read from `%globaltimer`.
 * NOTE(review): the function name suggests the unit is nanoseconds - confirm
 * against the PTX ISA documentation.
 */
template <class S>
static __forceinline__ __device__ S device_nano() {
  S mclk;
  // `volatile` keeps the compiler from hoisting or merging the timer read.
  asm volatile("mov.u64 %0,%%globaltimer;" : "=l"(mclk));
  return mclk;
}

/**
 * @brief Reports pending CUDA errors and terminates the process on failure.
 *
 * Compiles to a no-op unless `CUDA_ERROR_CHECK` is defined. When enabled it
 * first checks `cudaGetLastError()` and then `cudaDeviceSynchronize()`; if
 * either reports an error, a message is written to `stderr` and the process
 * exits with status -1.
 *
 * @param file Source file name included in the message.
 * @param line Source line number included in the message.
 */
inline void __cudaCheckError(const char* file, const int line) {
#ifdef CUDA_ERROR_CHECK
  const cudaError pending = cudaGetLastError();
  if (pending != cudaSuccess) {
    fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n", file, line,
            cudaGetErrorString(pending));
    exit(-1);
  }

  // Stricter check: synchronizing the device costs performance, so disable
  // this part if that matters.
  const cudaError after_sync = cudaDeviceSynchronize();
  if (after_sync != cudaSuccess) {
    fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", file,
            line, cudaGetErrorString(after_sync));
    exit(-1);
  }
#endif
}
#define CudaCheckError() nv::merlin::__cudaCheckError(__FILE__, __LINE__)

/**
 * @brief Computes how many blocks of @p block_size threads cover @p N items.
 *
 * - `N == 0` yields 0. Previously `N - 1` wrapped around to `SIZE_MAX` and
 *   produced a meaningless grid size.
 * - `N` up to `INT_MAX` yields `ceil(N / block_size)`.
 * - Larger `N` yields the fixed cap `((1 << 30) - 1) / block_size + 1`.
 *
 * @param N Number of items to cover.
 * @param block_size Threads per block.
 * NOTE(review): presumably required to be positive - not validated here.
 * @return The grid size described above.
 */
static inline size_t SAFE_GET_GRID_SIZE(size_t N, int block_size) {
  if (N == 0) {
    return 0;
  }
  if (N > static_cast<size_t>(std::numeric_limits<int>::max())) {
    return static_cast<size_t>(((1 << 30) - 1) / block_size + 1);
  }
  return (N - 1) / block_size + 1;
}

/**
 * @brief Clamps a requested block size to the device's `maxThreadsPerBlock`.
 *
 * @param block_size The requested number of threads per block.
 * @param device The device to query; `-1` means the current device as
 * reported by `cudaGetDevice`.
 * @return @p block_size if it does not exceed the device limit, otherwise the
 * limit itself. A notice is printed to `stdout` when clamping happens.
 */
static inline int SAFE_GET_BLOCK_SIZE(int block_size, int device = -1) {
  int target_device = device;
  if (target_device == -1) {
    CUDA_CHECK(cudaGetDevice(&target_device));
  }

  cudaDeviceProp properties;
  CUDA_CHECK(cudaGetDeviceProperties(&properties, target_device));

  const int limit = properties.maxThreadsPerBlock;
  if (block_size <= limit) {
    return block_size;
  }

  fprintf(stdout,
          "The requested block_size=%d exceeds the device limit, "
          "the maxThreadsPerBlock=%d will be applied.\n",
          block_size, limit);
  return limit;
}

/**
 * @brief Host-side 64-bit mixing function (xor-shift by 33 and multiply, in
 * three rounds).
 *
 * @param key The value to hash.
 * @return The mixed 64-bit value.
 */
inline uint64_t Murmur3HashHost(const uint64_t& key) {
  constexpr uint64_t kFirstMultiplier = UINT64_C(0xff51afd7ed558ccd);
  constexpr uint64_t kSecondMultiplier = UINT64_C(0xc4ceb9fe1a85ec53);

  uint64_t mixed = key;
  mixed = (mixed ^ (mixed >> 33)) * kFirstMultiplier;
  mixed = (mixed ^ (mixed >> 33)) * kSecondMultiplier;
  return mixed ^ (mixed >> 33);
}

/**
 * @brief Device-side 64-bit mixing function for `uint64_t` keys (xor-shift by
 * 33 and multiply, in three rounds).
 *
 * @param key The value to hash.
 * @return The mixed 64-bit value.
 */
__inline__ __device__ uint64_t Murmur3HashDevice(uint64_t const& key) {
  constexpr uint64_t kFirstMultiplier = UINT64_C(0xff51afd7ed558ccd);
  constexpr uint64_t kSecondMultiplier = UINT64_C(0xc4ceb9fe1a85ec53);

  uint64_t mixed = key;
  mixed = (mixed ^ (mixed >> 33)) * kFirstMultiplier;
  mixed = (mixed ^ (mixed >> 33)) * kSecondMultiplier;
  return mixed ^ (mixed >> 33);
}

/**
 * @brief Device-side 64-bit mixing function for `int64_t` keys.
 *
 * The key is converted to `uint64_t`, mixed with the same xor-shift/multiply
 * rounds as the unsigned overload, and converted back to `int64_t`.
 *
 * @param key The value to hash.
 * @return The mixed value as `int64_t`.
 */
__inline__ __device__ int64_t Murmur3HashDevice(int64_t const& key) {
  constexpr uint64_t kFirstMultiplier = UINT64_C(0xff51afd7ed558ccd);
  constexpr uint64_t kSecondMultiplier = UINT64_C(0xc4ceb9fe1a85ec53);

  uint64_t mixed = static_cast<uint64_t>(key);
  mixed = (mixed ^ (mixed >> 33)) * kFirstMultiplier;
  mixed = (mixed ^ (mixed >> 33)) * kSecondMultiplier;
  mixed = mixed ^ (mixed >> 33);
  return static_cast<int64_t>(mixed);
}

/**
 * @brief Device-side 32-bit mixing function for `uint32_t` keys (xor-shift by
 * 16, 13 and 16 with two multiplies in between).
 *
 * @param key The value to hash.
 * @return The mixed 32-bit value.
 */
__inline__ __device__ uint32_t Murmur3HashDevice(uint32_t const& key) {
  constexpr uint32_t kFirstMultiplier = UINT32_C(0x85ebca6b);
  constexpr uint32_t kSecondMultiplier = UINT32_C(0xc2b2ae35);

  uint32_t mixed = key;
  mixed = (mixed ^ (mixed >> 16)) * kFirstMultiplier;
  mixed = (mixed ^ (mixed >> 13)) * kSecondMultiplier;
  return mixed ^ (mixed >> 16);
}

/**
 * @brief Device-side 32-bit mixing function for `int32_t` keys.
 *
 * The key is converted to `uint32_t`, mixed with the same xor-shift/multiply
 * rounds as the unsigned overload, and converted back to `int32_t`.
 *
 * @param key The value to hash.
 * @return The mixed value as `int32_t`.
 */
__inline__ __device__ int32_t Murmur3HashDevice(int32_t const& key) {
  constexpr uint32_t kFirstMultiplier = UINT32_C(0x85ebca6b);
  constexpr uint32_t kSecondMultiplier = UINT32_C(0xc2b2ae35);

  uint32_t mixed = static_cast<uint32_t>(key);
  mixed = (mixed ^ (mixed >> 16)) * kFirstMultiplier;
  mixed = (mixed ^ (mixed >> 13)) * kSecondMultiplier;
  mixed = mixed ^ (mixed >> 16);
  return static_cast<int32_t>(mixed);
}

/**
 * @brief Scope guard for the current CUDA device.
 *
 * The constructor records the device returned by `cudaGetDevice`; the
 * destructor passes that device back to `cudaSetDevice`.
 */
class CudaDeviceRestorer {
 public:
  /// Records the device that is current at construction time.
  CudaDeviceRestorer() {
    CUDA_CHECK(cudaGetDevice(&dev_));
  }

  /// Makes the recorded device current again.
  ~CudaDeviceRestorer() {
    CUDA_CHECK(cudaSetDevice(dev_));
  }

 private:
  int dev_;  ///< Device ordinal captured by the constructor.
};

/**
 * @brief Returns the device ordinal that owns a device-memory pointer.
 *
 * @param ptr The pointer to inspect with `cudaPointerGetAttributes`.
 * @return The `device` field of the pointer attributes when the pointer's
 * memory type is `cudaMemoryTypeDevice`, otherwise -1.
 */
static inline int get_dev(const void* ptr) {
  cudaPointerAttributes attributes;
  CUDA_CHECK(cudaPointerGetAttributes(&attributes, ptr));

  // The attribute field holding the memory type depends on the runtime
  // version.
#if CUDART_VERSION >= 10000
  const bool in_device_memory = (attributes.type == cudaMemoryTypeDevice);
#else
  const bool in_device_memory = (attributes.memoryType == cudaMemoryTypeDevice);
#endif

  return in_device_memory ? attributes.device : -1;
}

/**
 * @brief Makes the device that owns @p ptr the current CUDA device.
 *
 * Does nothing when `get_dev(ptr)` reports a negative ordinal, i.e. the
 * pointer is not classified as device memory.
 *
 * @param ptr The pointer whose owning device should become current.
 */
static inline void switch_to_dev(const void* ptr) {
  const int owner = get_dev(ptr);
  if (owner < 0) {
    return;
  }
  CUDA_CHECK(cudaSetDevice(owner));
}

/**
 * @brief Tells whether a pointer is classified as device memory.
 *
 * @param ptr The pointer to inspect with `cudaPointerGetAttributes`.
 * @return `true` when the pointer's memory type is `cudaMemoryTypeDevice`.
 */
static inline bool is_on_device(const void* ptr) {
  cudaPointerAttributes attributes;
  CUDA_CHECK(cudaPointerGetAttributes(&attributes, ptr));

  // The attribute field holding the memory type depends on the runtime
  // version.
#if CUDART_VERSION >= 10000
  const auto memory_kind = attributes.type;
#else
  const auto memory_kind = attributes.memoryType;
#endif
  return memory_kind == cudaMemoryTypeDevice;
}

/**
 * @brief Device-side value conversion from `TIN` to `TOUT`.
 *
 * Only the explicit specializations below are defined; each provides a static
 * `convert` function.
 */
template <typename TOUT, typename TIN>
struct TypeConvertFunc;

/// `float` -> `__half` via `__float2half`.
template <>
struct TypeConvertFunc<__half, float> {
  static __forceinline__ __device__ __half convert(float val) {
    const __half converted = __float2half(val);
    return converted;
  }
};

/// `__half` -> `float` via `__half2float`.
template <>
struct TypeConvertFunc<float, __half> {
  static __forceinline__ __device__ float convert(__half val) {
    const float converted = __half2float(val);
    return converted;
  }
};

/// `float` -> `float`, identity.
template <>
struct TypeConvertFunc<float, float> {
  static __forceinline__ __device__ float convert(float val) {
    return val;
  }
};

/// `long long` -> `float`.
template <>
struct TypeConvertFunc<float, long long> {
  static __forceinline__ __device__ float convert(long long val) {
    const float converted = static_cast<float>(val);
    return converted;
  }
};

/// `unsigned int` -> `float`.
template <>
struct TypeConvertFunc<float, unsigned int> {
  static __forceinline__ __device__ float convert(unsigned int val) {
    const float converted = static_cast<float>(val);
    return converted;
  }
};

/// `long long` -> `int`.
template <>
struct TypeConvertFunc<int, long long> {
  static __forceinline__ __device__ int convert(long long val) {
    const int converted = static_cast<int>(val);
    return converted;
  }
};

/// `unsigned int` -> `int`.
template <>
struct TypeConvertFunc<int, unsigned int> {
  static __forceinline__ __device__ int convert(unsigned int val) {
    const int converted = static_cast<int>(val);
    return converted;
  }
};

/**
 * @brief Acquires @p set_mutex on behalf of a tile when `THREAD_SAFE` is set.
 *
 * @tparam mutex Mutex type providing `acquire(tile, lane)`.
 * @tparam TILE_SIZE Size of the cooperative-groups tile.
 * @tparam THREAD_SAFE When `false`, the call does nothing.
 *
 * @param tile The tile forwarded to `acquire`.
 * @param set_mutex The mutex to acquire.
 * @param lane The lane forwarded to `acquire`.
 */
template <typename mutex, uint32_t TILE_SIZE, bool THREAD_SAFE = true>
__forceinline__ __device__ void lock(
    const cg::thread_block_tile<TILE_SIZE>& tile, mutex& set_mutex,
    unsigned long long lane = 0) {
  if (!THREAD_SAFE) {
    return;
  }
  set_mutex.acquire(tile, lane);
}

/**
 * @brief Releases @p set_mutex on behalf of a tile when `THREAD_SAFE` is set.
 *
 * @tparam mutex Mutex type providing `release(tile, lane)`.
 * @tparam TILE_SIZE Size of the cooperative-groups tile.
 * @tparam THREAD_SAFE When `false`, the call does nothing.
 *
 * @param tile The tile forwarded to `release`.
 * @param set_mutex The mutex to release.
 * @param lane The lane forwarded to `release`.
 */
template <typename mutex, uint32_t TILE_SIZE, bool THREAD_SAFE = true>
__forceinline__ __device__ void unlock(
    const cg::thread_block_tile<TILE_SIZE>& tile, mutex& set_mutex,
    unsigned long long lane = 0) {
  if (!THREAD_SAFE) {
    return;
  }
  set_mutex.release(tile, lane);
}

/**
 * @brief Frees a variadic list of pointers, choosing the deallocator from the
 * CUDA pointer attributes of each one.
 *
 * For every non-null pointer:
 * - device pointer set, host pointer unset: `cudaFreeAsync(ptr, stream)`;
 * - device and host pointer both set: `cudaFreeHost(ptr)`;
 * - otherwise: `free(ptr)`.
 *
 * @param stream The stream used for `cudaFreeAsync`.
 * @param n The number of `void*` arguments that follow.
 * @param ... The pointers to free; null pointers are skipped.
 *
 * @throws nv::merlin::CudaException Rethrown unchanged when a checked CUDA
 * call fails. The pointers after the failing one are not freed.
 */
inline void free_pointers(cudaStream_t stream, int n, ...) {
  va_list args;
  va_start(args, n);
  try {
    for (int i = 0; i < n; i++) {
      void* ptr = va_arg(args, void*);
      if (!ptr) {
        continue;
      }
      cudaPointerAttributes attr;
      memset(&attr, 0, sizeof(cudaPointerAttributes));
      CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));
      if (attr.devicePointer && (!attr.hostPointer)) {
        CUDA_CHECK(cudaFreeAsync(ptr, stream));
      } else if (attr.devicePointer && attr.hostPointer) {
        CUDA_CHECK(cudaFreeHost(ptr));
      } else {
        free(ptr);
      }
    }
  } catch (const nv::merlin::CudaException&) {
    // Close the va_list before propagating. A bare `throw;` rethrows the
    // original exception object instead of a sliced copy.
    va_end(args);
    throw;
  }
  va_end(args);
}

/**
 * @brief Fills `count` 64-bit elements starting at @p devPtr with @p value.
 *
 * Uses a grid-stride loop, so the result is correct for any 1-D launch
 * configuration. Indices are computed in `size_t` so that
 * `blockIdx.x * blockDim.x` cannot overflow 32 bits for large buffers.
 *
 * @param devPtr Destination buffer, written as an array of `uint64_t`.
 * @param value The 64-bit pattern to store.
 * @param count Number of 64-bit elements to write.
 */
static __global__ void memset64bitKernel(void* devPtr, uint64_t value,
                                         size_t count) {
  uint64_t* const dst = static_cast<uint64_t*>(devPtr);
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < count; idx += stride) {
    dst[idx] = value;
  }
}

/**
 * @brief Launches `memset64bitKernel` on @p stream to fill a buffer with a
 * 64-bit pattern.
 *
 * @param devPtr Destination buffer, written as an array of `uint64_t`.
 * @param value The 64-bit pattern to store.
 * @param count Number of 64-bit elements to write. `0` is a no-op that
 * returns `cudaSuccess` (previously it launched a zero-block grid, which is
 * an invalid configuration).
 * @param stream The stream to launch on.
 * @return `cudaSuccess` for an empty request, otherwise the result of
 * `cudaGetLastError()` after the launch.
 */
__forceinline__ __host__ cudaError_t memset64Async(void* devPtr, uint64_t value,
                                                   size_t count,
                                                   cudaStream_t stream = 0) {
  if (count == 0) {
    return cudaSuccess;
  }
  constexpr int blockSize = 256;
  // Largest gridDim.x accepted by the launch; the kernel's grid-stride loop
  // covers whatever the capped grid does not reach directly.
  constexpr size_t kMaxGridX = 0x7FFFFFFF;
  const size_t wanted = (count - 1) / blockSize + 1;
  const unsigned int numBlocks =
      static_cast<unsigned int>(wanted < kMaxGridX ? wanted : kMaxGridX);
  memset64bitKernel<<<numBlocks, blockSize, 0, stream>>>(devPtr, value, count);
  return cudaGetLastError();
}

// Frees every pointer in the argument list via nv::merlin::free_pointers.
// The pointer count is derived from the size of a temporary `void*` array
// built from the arguments, so callers only list the pointers.
// NOTE(review): the expansion already ends with ';', so `CUDA_FREE_POINTERS(...);`
// produces an extra empty statement - keep that in mind inside unbraced
// if/else bodies.
#define CUDA_FREE_POINTERS(stream, ...) \
  nv::merlin::free_pointers(            \
      stream, (sizeof((void*[]){__VA_ARGS__}) / sizeof(void*)), __VA_ARGS__);

/// Converts a count of gibibytes to bytes (`n * 2^30`).
static inline size_t GB(size_t n) {
  return n * (static_cast<size_t>(1) << 30);
}

/// Converts a count of mebibytes to bytes (`n * 2^20`).
static inline size_t MB(size_t n) {
  return n * (static_cast<size_t>(1) << 20);
}

/// Converts a count of kibibytes to bytes (`n * 2^10`).
static inline size_t KB(size_t n) {
  return n * (static_cast<size_t>(1) << 10);
}

/// Returns `true` when @p x is a non-zero power of two.
constexpr inline bool ispow2(unsigned x) {
  return x != 0u && (x & (x - 1u)) == 0u;
}

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin_hashtable.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <atomic>
#include <cstdint>
#include <cub/cub.cuh>
#include <iostream>
#include <limits>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <type_traits>
#include "merlin/allocator.cuh"
#include "merlin/array_kernels.cuh"
#include "merlin/core_kernels.cuh"
#include "merlin/flexible_buffer.cuh"
#include "merlin/group_lock.cuh"
#include "merlin/memory_pool.cuh"
#include "merlin/multi_vector.hpp"
#include "merlin/types.cuh"
#include "merlin/utils.cuh"

namespace nv {
namespace merlin {

/**
 * @brief The eviction strategies.
 *
 * @note The `Score` concept defines the importance of each key: the larger
 * the score, the more important the key and the less likely it is to be
 * evicted. In `kLru` mode, the `scores` parameter of the APIs should be kept
 * as `nullptr`; the score of each key is assigned internally following the
 * LRU (Least Recently Used) policy. In `kCustomized` mode, the `scores`
 * should be provided by the caller.
 *
 * @note Eviction occurs automatically when a bucket is full. The keys with the
 * minimum `score` value are evicted first.
 *
 * @note In `kLru` mode, the score is set to the device clock in nanoseconds,
 * which could differ slightly from the host clock.
 *
 * @note For `kEpochLru` and `kEpochLfu`, the high 32 bits are set to
 * `global_epoch` while the low 32 bits hold the `timestamp` or `frequency`.
 *
 * @note In `kLfu` mode, the caller provides the frequency increment through
 * the `scores` input parameter of the `insert-like` APIs. Once a score
 * reaches the maximum of `uint64_t`, it does not increase any further.
 *
 * @note In `kEpochLru` mode, the high 32 bits are the global epoch provided
 * via the `global_epoch` input parameter, and the low 32 bits are equal to
 * `(device_clock >> 20) & 0xffffffff`, with a granularity close to 1 ms.
 *
 * @note In `kEpochLfu` mode, the high 32 bits are the global epoch provided
 * via the `global_epoch` input parameter, and the low 32 bits are the
 * frequency, which stays constant after reaching the maximum value of
 * `0xffffffff`.
 *
 * @note In `kCustomized` mode, the scores are fully provided by the caller
 * through the `scores` input parameter of the `insert-like` APIs.
 *
 */
struct EvictStrategy {
  enum EvictStrategyEnum {
    kLru = 0,         ///< LRU mode.
    kLfu = 1,         ///< LFU mode.
    kEpochLru = 2,    ///< Epoch + LRU mode.
    kEpochLfu = 3,    ///< Epoch + LFU mode.
    kCustomized = 4,  ///< Customized mode.
  };
};

/**
 * @brief Table operation mode.
 *
 * - `kThroughput`: the default mode; single-bucket addressing, optimized for
 *   throughput.
 * - `kMemory`: dual-bucket addressing, optimized for memory efficiency
 *   (higher load factor).
 */
enum class TableMode {
  kThroughput = 0,  ///< Default: single-bucket, max throughput.
  kMemory = 1,      ///< Dual-bucket, higher load factor.
};

/**
 * @brief The options struct of HierarchicalKV.
 *
 * Every field has a default, so a default-constructed instance can be
 * adjusted field by field.
 */
struct HashTableOptions {
  size_t init_capacity = 0;        ///< The initial capacity of the hash table.
  size_t max_capacity = 0;         ///< The maximum capacity of the hash table.
  size_t max_hbm_for_vectors = 0;  ///< The maximum HBM for vectors, in bytes.
  size_t max_bucket_size = 128;    ///< The length of each bucket.
  size_t dim = 64;                 ///< The dimension of the vectors.
  float max_load_factor = 0.5f;    ///< The max load factor before rehashing.
  int block_size = 128;            ///< The default block size for CUDA kernels.
  int io_block_size = 1024;        ///< The block size for IO CUDA kernels.
  int device_id = -1;              ///< The ID of device.
  bool io_by_cpu = false;  ///< The flag indicating if the CPU handles IO.
  bool use_constant_memory = false;  ///< Reserved.
  /*
   * `reserved_key_start_bit = 0` is the default: HKV reserves
   * `0xFFFFFFFFFFFFFFFD`, `0xFFFFFFFFFFFFFFFE`, and `0xFFFFFFFFFFFFFFFF` for
   * internal use. If these conflict with your keys, set
   * `reserved_key_start_bit` to a number between 1 and 62.
   * For example, `reserved_key_start_bit = 1` uses bits 1 and 2 to encode the
   * reserved keys, with bit 0 fixed to 0 and all other bits set to 1; the
   * reserved keys then become `0xFFFFFFFFFFFFFFFE`, `0xFFFFFFFFFFFFFFFC`,
   * `0xFFFFFFFFFFFFFFF8`, and `0xFFFFFFFFFFFFFFFA`. The console log prints the
   * reserved keys during table initialization.
   */
  int reserved_key_start_bit = 0;       ///< The binary index of reserved key.
  size_t num_of_buckets_per_alloc = 1;  ///< Number of buckets allocated in each
                                        ///< HBM allocation, must be power of 2.
  bool api_lock = true;  ///<  The flag indicating whether to lock the table
                         ///<  once an API call is entered.
  TableMode table_mode = TableMode::kThroughput;  ///< Table operation mode.
  MemoryPoolOptions
      device_memory_pool;  ///< Configuration options for device memory pool.
  MemoryPoolOptions
      host_memory_pool;  ///< Configuration options for host memory pool.
};

/**
 * @brief A customizable predicate that selects items of the hash table by
 * returning `true`.
 *
 * @note The `erase_if` or `export_batch_if` API traverses all of the items
 * with this function; the items for which it returns `true` are removed or
 * exported, respectively.
 *
 *  Example for erase_if:
 *
 *    ```
 *    template <class K, class S>
 *    struct EraseIfPredFunctor {
 *      __forceinline__ __device__ bool operator()(const K& key,
 *                                                 S& score,
 *                                                 const K& pattern,
 *                                                 const S& threshold) {
 *        // Parenthesize the mask: `==` binds tighter than `&`.
 *        return (((key & 0xFFFF000000000000) == pattern) &&
 *                (score < threshold));
 *      }
 *    };
 *    ```
 *
 *  Example for export_batch_if:
 *    ```
 *    template <class K, class S>
 *    struct ExportIfPredFunctor {
 *      __forceinline__ __device__ bool operator()(const K& key,
 *                                                 S& score,
 *                                                 const K& pattern,
 *                                                 const S& threshold) {
 *        return score >= threshold;
 *      }
 *    };
 *    ```
 */
template <class K, class S>
using EraseIfPredict = bool (*)(
    const K& key,       ///< The traversed key in a hash table.
    S& score,           ///< The traversed score in a hash table.
    const K& pattern,   ///< The key pattern to compare with the `key` argument.
    const S& threshold  ///< The threshold to compare with the `score` argument.
);

// Execution policy alias for Thrust calls: `thrust::cuda::par_nosync` when the
// Thrust version is at least 1.16.0 (101600), otherwise `thrust::cuda::par`.
#if THRUST_VERSION >= 101600
static constexpr auto& thrust_par = thrust::cuda::par_nosync;
#else
static constexpr auto& thrust_par = thrust::cuda::par;
#endif

template <typename K, typename V, typename S = uint64_t>
class HashTableBase {
 public:
  using size_type = size_t;
  using key_type = K;
  using value_type = V;
  using score_type = S;
  using allocator_type = BaseAllocator;

 public:
  virtual ~HashTableBase() {}

  /**
   * @brief Initialize a merlin::HashTable.
   *
   * @param options The configuration options.
   */
  virtual void init(const HashTableOptions& options,
                    allocator_type* allocator = nullptr) = 0;

  /**
   * @brief Insert new key-value-score tuples into the hash table.
   * If the key already exists, the values and scores are assigned new values.
   *
   * If the target bucket is full, the keys with minimum score will be
   * overwritten by new key unless the score of the new key is even less than
   * minimum score of the target bucket.
   *
   * @param n Number of key-value-score tuples to insert or assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param values The values to insert on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to insert on GPU-accessible memory with shape
   * (n).
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   * @param ignore_evict_strategy A boolean option indicating whether if
   * the insert_or_assign ignores the evict strategy of table with current
   * scores anyway. If true, it does not check whether the scores conforms to
   * the evict strategy. If false, it requires the scores follow the evict
   * strategy of table.
   */
  virtual void insert_or_assign(const size_type n,
                                const key_type* keys,                // (n)
                                const value_type* values,            // (n, DIM)
                                const score_type* scores = nullptr,  // (n)
                                cudaStream_t stream = 0, bool unique_key = true,
                                bool ignore_evict_strategy = false) = 0;

  /**
   * @brief Insert new key-value-score tuples into the hash table.
   * If the key already exists, the values and scores are assigned new values.
   *
   * If the target bucket is full, the keys with minimum score will be
   * overwritten by new key unless the score of the new key is even less than
   * minimum score of the target bucket. The overwritten key with minimum
   * score will be evicted, with its values and score, to evicted_keys,
   * evicted_values, evicted_scores separately in compact format.
   *
   * @param n Number of key-value-score tuples to insert or assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param values The values to insert on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to insert on GPU-accessible memory with shape
   * (n).
   * @param evicted_keys The output of keys replaced with minimum score.
   * @param evicted_values The output of values replaced with minimum score on
   * keys.
   * @param evicted_scores The output of scores replaced with minimum score on
   * keys.
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param d_evicted_counter The number of elements evicted on GPU-accessible
   * memory. @notice The caller should guarantee it is set to `0` before
   * calling.
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   * @param ignore_evict_strategy A boolean option indicating whether if
   * the insert_or_assign ignores the evict strategy of table with current
   * scores anyway. If true, it does not check whether the scores conforms to
   * the evict strategy. If false, it requires the scores follow the evict
   * strategy of table.
   */
  virtual void insert_and_evict(const size_type n,
                                const key_type* keys,          // (n)
                                const value_type* values,      // (n, DIM)
                                const score_type* scores,      // (n)
                                key_type* evicted_keys,        // (n)
                                value_type* evicted_values,    // (n, DIM)
                                score_type* evicted_scores,    // (n)
                                size_type* d_evicted_counter,  // (1)
                                cudaStream_t stream = 0, bool unique_key = true,
                                bool ignore_evict_strategy = false) = 0;

  /**
   * @brief Insert new key-value-score tuples into the hash table.
   * If the key already exists, the values and scores are assigned new values.
   *
   * If the target bucket is full, the keys with minimum score will be
   * overwritten by new key unless the score of the new key is even less than
   * minimum score of the target bucket. The overwritten key with minimum
   * score will be evicted, with its values and score, to evicted_keys,
   * evicted_values, evicted_scores separately in compact format.
   *
   * @param n Number of key-value-score tuples to insert or assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param values The values to insert on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to insert on GPU-accessible memory with shape
   * (n).
   * @param evicted_keys The output of keys replaced with minimum score.
   * @param evicted_values The output of values replaced with minimum score on
   * keys.
   * @param evicted_scores The output of scores replaced with minimum score on
   * keys.
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   * @param ignore_evict_strategy A boolean option indicating whether if
   * the insert_or_assign ignores the evict strategy of table with current
   * scores anyway. If true, it does not check whether the scores conforms to
   * the evict strategy. If false, it requires the scores follow the evict
   * strategy of table.
   *
   * @return The number of elements evicted.
   */
  virtual size_type insert_and_evict(const size_type n,
                                     const key_type* keys,        // (n)
                                     const value_type* values,    // (n, DIM)
                                     const score_type* scores,    // (n)
                                     key_type* evicted_keys,      // (n)
                                     value_type* evicted_values,  // (n, DIM)
                                     score_type* evicted_scores,  // (n)
                                     cudaStream_t stream = 0,
                                     bool unique_key = true,
                                     bool ignore_evict_strategy = false) = 0;

  /**
   * Searches for each key in @p keys in the hash table.
   * If the key is found and the corresponding value in @p accum_or_assigns is
   * `true`, the @p vectors_or_deltas is treated as a delta to the old
   * value, and the delta is added to the old value of the key.
   *
   * If the key is not found and the corresponding value in @p accum_or_assigns
   * is `false`, the @p vectors_or_deltas is treated as a new value and the
   * key-value pair is updated in the table directly.
   *
   * @note When the key is found and the value of @p accum_or_assigns is
   * `false`, or when the key is not found and the value of @p accum_or_assigns
   * is `true`, nothing is changed and this operation is ignored.
   * The algorithm assumes these situations occur while the key was modified or
   * removed by other processes just now.
   *
   * @param n The number of key-value-score tuples to process.
   * @param keys The keys to insert on GPU-accessible memory with shape (n).
   * @param value_or_deltas The values or deltas to insert on GPU-accessible
   * memory with shape (n, DIM).
   * @param accum_or_assigns The operation type with shape (n). A value of
   * `true` indicates to accum and `false` indicates to assign.
   * @param scores The scores to insert on GPU-accessible memory with shape (n).
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @param ignore_evict_strategy A boolean option indicating whether if
   * the accum_or_assign ignores the evict strategy of table with current
   * scores anyway. If true, it does not check whether the scores conforms to
   * the evict strategy. If false, it requires the scores follow the evict
   * strategy of table.
   */
  virtual void accum_or_assign(const size_type n,
                               const key_type* keys,                // (n)
                               const value_type* value_or_deltas,   // (n, DIM)
                               const bool* accum_or_assigns,        // (n)
                               const score_type* scores = nullptr,  // (n)
                               cudaStream_t stream = 0,
                               bool ignore_evict_strategy = false) = 0;

  /**
   * @brief Searches the hash table for the specified keys.
   * When a key is missing, the value in @p values and @p scores will be
   * inserted.
   *
   * @param n The number of key-value-score tuples to search or insert.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The values to search on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   */
  virtual void find_or_insert(const size_type n, const key_type* keys,  // (n)
                              value_type* values,            // (n * DIM)
                              score_type* scores = nullptr,  // (n)
                              cudaStream_t stream = 0, bool unique_key = true,
                              bool ignore_evict_strategy = false) = 0;

  /**
   * @brief Searches the hash table for the specified keys and returns address
   * of the values. When a key is missing, the value in @p values and @p scores
   * will be inserted.
   *
   * @warning This API returns internal addresses for high-performance but
   * thread-unsafe. The caller is responsible for guaranteeing data consistency.
   *
   * @param n The number of key-value-score tuples to search or insert.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values  The addresses of values to search on GPU-accessible memory
   * with shape (n).
   * @param founds The status that indicates if the keys are found, on
   * GPU-accessible memory with shape (n).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   * @param locked_key_ptrs If it isn't nullptr then the keys in the table will
   * be locked, and key's address will write to locked_key_ptrs. Using
   * unlock_keys to unlock these keys.
   *
   */
  virtual void find_or_insert(const size_type n, const key_type* keys,  // (n)
                              value_type** values,                      // (n)
                              bool* founds,                             // (n)
                              score_type* scores = nullptr,             // (n)
                              cudaStream_t stream = 0, bool unique_key = true,
                              bool ignore_evict_strategy = false,
                              key_type** locked_key_ptrs = nullptr) = 0;

  /**
   * @brief
   * This function locks the given keys in the table; keys that do not exist
   * are ignored.
   *
   * @param n The number of keys in the table to be locked.
   * @param locked_key_ptrs The pointers of locked keys in the table with shape
   * (n).
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param succeededs The status that indicates if the lock operation is
   * succeed.
   * @param scores The scores of the input keys will set to scores if provided.
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  virtual void lock_keys(const size_type n,
                         key_type const* keys,        // (n)
                         key_type** locked_key_ptrs,  // (n)
                         bool* succeededs = nullptr,  // (n)
                         cudaStream_t stream = 0,
                         score_type const* scores = nullptr) = 0;

  /**
   * @brief Uses pointers to address the keys in the hash table and sets them
   * to the target keys.
   * This function unlocks the keys in the table that were locked by a
   * previous call to find_or_insert.
   * NOTE(review): `lock_keys` also fills `locked_key_ptrs`; presumably keys
   * locked through it are released here as well -- confirm.
   *
   * @param n The number of keys in the table to be unlocked.
   * @param locked_key_ptrs The pointers of locked keys in the table with shape
   * (n).
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param succeededs The status that indicates whether the unlock operation
   * succeeded for each key, with shape (n). May be `nullptr` (the default).
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  virtual void unlock_keys(const size_type n,
                           key_type** locked_key_ptrs,  // (n)
                           const key_type* keys,        // (n)
                           bool* succeededs = nullptr,  // (n)
                           cudaStream_t stream = 0) = 0;

  /**
   * @brief Assign new key-value-score tuples into the hash table.
   * If the key doesn't exist, the operation on the key will be ignored.
   *
   * @param n Number of key-value-score tuples to insert or assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param values The values to insert on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to insert on GPU-accessible memory with shape
   * (n).
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @param unique_key If all keys in the same batch are unique.
   */
  virtual void assign(const size_type n,
                      const key_type* keys,                // (n)
                      const value_type* values,            // (n, DIM)
                      const score_type* scores = nullptr,  // (n)
                      cudaStream_t stream = 0, bool unique_key = true) = 0;

  /**
   * @brief Assign new scores for keys.
   * If the key doesn't exist, the operation on the key will be ignored.
   *
   * @param n Number of key-score pairs to assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param scores The scores to assign on GPU-accessible memory with shape
   * (n).
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @param unique_key If all keys in the same batch are unique.
   */
  virtual void assign_scores(const size_type n,
                             const key_type* keys,                // (n)
                             const score_type* scores = nullptr,  // (n)
                             cudaStream_t stream = 0,
                             bool unique_key = true) = 0;

  /**
   * @brief Alias of `assign_scores`.
   */
  virtual void assign(const size_type n,
                      const key_type* keys,                // (n)
                      const score_type* scores = nullptr,  // (n)
                      cudaStream_t stream = 0, bool unique_key = true) = 0;

  /**
   * @brief Assign new values for each key.
   * If the key doesn't exist, the operation on the key will be ignored.
   *
   * @param n Number of key-value pairs to assign.
   * @param keys The keys need to be operated, which must be on GPU-accessible
   * memory with shape (n).
   * @param values The values need to be updated, which must be on
   * GPU-accessible memory with shape (n, DIM).
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @param unique_key If all keys in the same batch are unique.
   */
  virtual void assign_values(const size_type n,
                             const key_type* keys,      // (n)
                             const value_type* values,  // (n, DIM)
                             cudaStream_t stream = 0,
                             bool unique_key = true) = 0;
  /**
   * @brief Searches the hash table for the specified keys.
   *
   * @note When a key is missing, the value in @p values is not changed.
   *
   * @param n The number of key-value-score tuples to search.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The values to search on GPU-accessible memory with
   * shape (n, DIM).
   * @param founds The status that indicates if the keys are found on
   * GPU-accessible memory with shape (n).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  virtual void find(const size_type n, const key_type* keys,  // (n)
                    value_type* values,                       // (n, DIM)
                    bool* founds,                             // (n)
                    score_type* scores = nullptr,             // (n)
                    cudaStream_t stream = 0) const = 0;

  /**
   * @brief Searches the hash table for the specified keys.
   *
   * @note When the searched keys are not hit, missed keys/indices/size can be
   * obtained.
   *
   * @param n The number of key-value-score tuples to search.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The values to search on GPU-accessible memory with
   * shape (n, DIM).
   * @param missed_keys The missed keys to search on GPU-accessible memory with
   * shape (n).
   * @param missed_indices The missed indices to search on GPU-accessible memory
   * with shape (n).
   * @param missed_size The size of `missed_keys` and `missed_indices`.
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   */
  virtual void find(const size_type n, const key_type* keys,  // (n)
                    value_type* values,                       // (n, DIM)
                    key_type* missed_keys,                    // (n)
                    int* missed_indices,                      // (n)
                    int* missed_size,                         // scalar
                    score_type* scores = nullptr,             // (n)
                    cudaStream_t stream = 0) const = 0;

  /**
   * @brief Searches the hash table for the specified keys and returns address
   * of the values.
   *
   * @note When a key is missing, the data in @p values won't change.
   * @warning This API returns internal addresses for high-performance but
   * thread-unsafe. The caller is responsible for guaranteeing data consistency.
   *
   * @param n The number of key-value-score tuples to search.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The addresses of values to search on GPU-accessible memory
   * with shape (n).
   * @param founds The status that indicates if the keys are found on
   * GPU-accessible memory with shape (n).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   */
  virtual void find(const size_type n, const key_type* keys,  // (n)
                    value_type** values,                      // (n)
                    bool* founds,                             // (n)
                    score_type* scores = nullptr,             // (n)
                    cudaStream_t stream = 0, bool unique_key = true) const = 0;

  /**
   * @brief Searches the hash table for the specified keys and returns address
   * of the values, and will update the scores.
   *
   * @note When a key is missing, the data in @p values won't change.
   * @warning This API returns internal addresses for high-performance but
   * thread-unsafe. The caller is responsible for guaranteeing data consistency.
   *
   * @param n The number of key-value-score tuples to search.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The addresses of values to search on GPU-accessible memory
   * with shape (n).
   * @param founds The status that indicates if the keys are found on
   * GPU-accessible memory with shape (n).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   */
  virtual void find_and_update(const size_type n, const key_type* keys,  // (n)
                               value_type** values,                      // (n)
                               bool* founds,                             // (n)
                               score_type* scores = nullptr,             // (n)
                               cudaStream_t stream = 0,
                               bool unique_key = true) = 0;

  /**
   * @brief Checks if there are elements with key equivalent to `keys` in the
   * table.
   *
   * @param n The number of `keys` to check.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param founds The result that indicates if the keys are found, and should
   * be allocated by caller on GPU-accessible memory with shape (n).
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  virtual void contains(const size_type n, const key_type* keys,  // (n)
                        bool* founds,                             // (n)
                        cudaStream_t stream = 0) const = 0;

  /**
   * @brief Removes specified elements from the hash table.
   *
   * @param n The number of keys to remove.
   * @param keys The keys to remove on GPU-accessible memory.
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  virtual void erase(const size_type n, const key_type* keys,
                     cudaStream_t stream = 0) = 0;

  /**
   * @brief Removes all of the elements in the hash table with no release
   * object.
   * NOTE(review): "with no release object" presumably means the table object
   * and its storage are kept and only the contents are dropped -- confirm
   * against the implementation.
   *
   * @param stream The CUDA stream that is used to execute the operation.
   */
  virtual void clear(cudaStream_t stream = 0) = 0;

  /**
   * @brief Exports a certain number of the key-value-score tuples from the
   * hash table.
   *
   * @param n The maximum number of exported pairs.
   * @param offset The position of the key to search.
   * @param d_counter Accumulates amount of successfully exported values, with
   * shape (1).
   * @param keys The keys to dump from GPU-accessible memory with shape (n).
   * @param values The values to dump from GPU-accessible memory with shape
   * (n, DIM).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @note This overload returns `void`; the amount of exported tuples is
   * reported through @p d_counter.
   *
   * @throw CudaException If the key-value size is too large for GPU shared
   * memory. Reducing the value for @p n is currently required if this exception
   * occurs.
   */
  virtual void export_batch(size_type n, const size_type offset,
                            size_type* d_counter,          // (1)
                            key_type* keys,                // (n)
                            value_type* values,            // (n, DIM)
                            score_type* scores = nullptr,  // (n)
                            cudaStream_t stream = 0) const = 0;

  /**
   * @brief Overload of `export_batch` that takes no counter argument and
   * returns a `size_type` instead.
   *
   * @param n The maximum number of exported pairs.
   * @param offset The position of the key to search.
   * @param keys The keys to dump, with shape (n).
   * @param values The values to dump, with shape (n, DIM).
   * @param scores The scores to dump, with shape (n). Defaults to `nullptr`.
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return NOTE(review): presumably the number of exported tuples -- confirm
   * against the implementation.
   */
  virtual size_type export_batch(const size_type n, const size_type offset,
                                 key_type* keys,                // (n)
                                 value_type* values,            // (n, DIM)
                                 score_type* scores = nullptr,  // (n)
                                 cudaStream_t stream = 0) const = 0;

  /**
   * @brief Indicates if the hash table has no elements.
   *
   * @param stream The CUDA stream that is used to execute the operation.
   * @return `true` if the table is empty and `false` otherwise.
   */
  virtual bool empty(cudaStream_t stream = 0) const = 0;

  /**
   * @brief Returns the hash table size.
   *
   * @param stream The CUDA stream that is used to execute the operation.
   * @return The table size.
   */
  virtual size_type size(cudaStream_t stream = 0) const = 0;

  /**
   * @brief Returns the hash table capacity.
   *
   * @note The value that is returned might be less than the actual capacity of
   * the hash table because the hash table currently keeps the capacity to be
   * a power of 2 for performance considerations.
   *
   * @return The table capacity.
   */
  virtual size_type capacity() const = 0;

  /**
   * @brief Sets the number of buckets to the number that is needed to
   * accommodate at least @p new_capacity elements without exceeding the maximum
   * load factor. This method rehashes the hash table. Rehashing puts the
   * elements into the appropriate buckets considering that total number of
   * buckets has changed.
   *
   * @note If the value of @p new_capacity or double of @p new_capacity is
   * greater than or equal to `options_.max_capacity`, the reserve does not
   * perform any change to the hash table.
   *
   * @param new_capacity The requested capacity for the hash table.
   * @param stream The CUDA stream that is used to execute the operation.
   */
  virtual void reserve(const size_type new_capacity,
                       cudaStream_t stream = 0) = 0;

  /**
   * @brief Returns the average number of elements per slot, that is, size()
   * divided by capacity().
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return The load factor
   */
  virtual float load_factor(cudaStream_t stream = 0) const = 0;

  /**
   * @brief Set max_capacity of the table.
   *
   * @param new_max_capacity The new expected max_capacity. It must be a power
   * of 2; otherwise an error is raised.
   */
  virtual void set_max_capacity(size_type new_max_capacity) = 0;

  /**
   * @brief Returns the dimension of the vectors.
   *
   * @return The dimension of the vectors.
   */
  virtual size_type dim() const noexcept = 0;

  /**
   * @brief Returns the length of each bucket.
   *
   * @return The length of each bucket.
   */
  virtual size_type max_bucket_size() const noexcept = 0;

  /**
   * @brief Returns the number of buckets in the table.
   *
   * @return The number of buckets in the table.
   */
  virtual size_type bucket_count() const noexcept = 0;

  /**
   * @brief Save keys, vectors, scores in table to file or files.
   *
   * @param file A BaseKVFile object that defines the file format on the host
   * filesystem.
   * @param max_workspace_size Saving is conducted in chunks. This value denotes
   * the maximum amount of temporary memory to use when dumping the table.
   * Larger values *can* lead to higher performance.
   * @param stream The CUDA stream used to execute the operation.
   *
   * @return Number of KV pairs saved to file.
   */
  virtual size_type save(BaseKVFile<K, V, S>* file,
                         const size_t max_workspace_size = 1L * 1024 * 1024,
                         cudaStream_t stream = 0) const = 0;

  /**
   * @brief Load keys, vectors, scores from file to table.
   *
   * @param file A BaseKVFile object that defines the file format within the
   * filesystem.
   * @param max_workspace_size Loading is conducted in chunks. This value
   * denotes the maximum size of such chunks. Larger values *can* lead to higher
   * performance.
   * @param stream The CUDA stream used to execute the operation.
   *
   * @return Number of keys loaded from file.
   */
  virtual size_type load(BaseKVFile<K, V, S>* file,
                         const size_t max_workspace_size = 1L * 1024 * 1024,
                         cudaStream_t stream = 0) = 0;

  /**
   * @brief Sets the global epoch of the table.
   *
   * NOTE(review): presumably consumed by the epoch-based eviction strategies
   * -- confirm against the implementations.
   *
   * @param epoch The new epoch value.
   */
  virtual void set_global_epoch(const uint64_t epoch) = 0;
};

/**
 * A HierarchicalKV hash table is a concurrent and hierarchical hash table that
 * is powered by GPUs and can use HBM and host memory as storage for key-value
 * pairs. Support for SSD storage is a future consideration.
 *
 * The `score` is introduced to define the importance of each key, the
 * larger, the more important, the less likely they will be evicted. Eviction
 * occurs automatically when a bucket is full. The keys with the minimum `score`
 * value are evicted first. In a customized eviction strategy, we recommend
 * using the timestamp or frequency of the key occurrence as the `score` value
 * for each key. You can also assign a special value to the `score` to
 * perform a customized eviction strategy.
 *
 * @note By default configuration, this class is thread-safe.
 *
 * @tparam K The data type of the key.
 * @tparam V The data type of the vector's item type.
 *         The item data type should be a basic data type of C++/CUDA.
 * @tparam S The data type for `score`.
 *           Supported types: `uint64_t` and `uint32_t` (only for
 *           `EvictStrategy::kCustomized`).
 *
 */
template <typename K, typename V, typename S = uint64_t,
          int Strategy = EvictStrategy::kLru, typename ArchTag = Sm80>
class HashTable : public HashTableBase<K, V, S> {
 public:
  using size_type = size_t;
  using key_type = K;
  using value_type = V;
  using score_type = S;
  static constexpr int evict_strategy = Strategy;

  using Pred = EraseIfPredict<key_type, score_type>;
  using allocator_type = BaseAllocator;

 private:
  using TableCore = nv::merlin::Table<key_type, value_type, score_type>;
  static constexpr unsigned int TILE_SIZE = 4;

  using DeviceMemoryPool = MemoryPool<DeviceAllocator<char>>;
  using HostMemoryPool = MemoryPool<HostAllocator<char>>;

 public:
  /**
   * @brief Default constructor for the hash table class.
   */
  HashTable() {
    // No run-time work happens here: the constructor only validates the
    // template arguments at compile time. Call init() to set up the table.
    constexpr bool kKeyIs64BitInteger =
        std::is_same<key_type, int64_t>::value ||
        std::is_same<key_type, uint64_t>::value;
    static_assert(kKeyIs64BitInteger,
                  "The key_type must be int64_t or uint64_t.");

    constexpr bool kScoreIs64Bit = std::is_same<score_type, uint64_t>::value;
    constexpr bool kScoreIs32Bit = std::is_same<score_type, uint32_t>::value;
    static_assert(kScoreIs64Bit || kScoreIs32Bit,
                  "The score_type must be uint64_t or uint32_t.");

    // Epoch-based strategies pack epoch (high 32 bits) and score (low 32
    // bits) into one value, so a 32-bit score type is only acceptable for the
    // customized strategy.
    constexpr bool kIsCustomized =
        (evict_strategy == EvictStrategy::kCustomized);
    static_assert(!kScoreIs32Bit || kIsCustomized,
                  "score_type uint32_t is only compatible with Customized; "
                  "use uint64_t.");
  }

  /**
   * @brief Frees the resources used by the hash table and destroys the hash
   * table object.
   */
  ~HashTable() {
    // A table that was never initialized holds nothing to release.
    if (!initialized_) {
      return;
    }

    // Wait for outstanding device work before tearing anything down.
    CUDA_CHECK(cudaDeviceSynchronize());
    initialized_ = false;

    // Release the table storage, the device-side table descriptor and both
    // workspace pools while the allocator is still alive.
    destroy_table<key_type, value_type, score_type>(&table_, allocator_);
    allocator_->free(MemoryType::Device, d_table_);
    dev_mem_pool_.reset();
    host_mem_pool_.reset();

    CUDA_CHECK(cudaDeviceSynchronize());

    // Only an allocator created by init() itself is destroyed here; a
    // caller-supplied allocator is left untouched.
    const bool owns_allocator = default_allocator_ && (allocator_ != nullptr);
    if (owns_allocator) {
      delete allocator_;
    }
  }

 private:
  HashTable(const HashTable&) = delete;
  HashTable& operator=(const HashTable&) = delete;
  HashTable(HashTable&&) = delete;
  HashTable& operator=(HashTable&&) = delete;

 public:
  /**
   * @brief Initialize a merlin::HashTable.
   *
   * Calling `init` on a table that is already initialized is a no-op.
   *
   * The option checks that do not need any resource run first, so an invalid
   * configuration is rejected before the default allocator is created. The
   * target device is selected before any further CUDA work is issued.
   *
   * @param options The configuration options.
   * @param allocator The allocator used by the table. If `nullptr` (the
   * default), a `DefaultAllocator` is created; it is deleted again by the
   * destructor. A caller-supplied allocator is never deleted by the table.
   */
  void init(const HashTableOptions& options,
            allocator_type* allocator = nullptr) {
    if (initialized_) {
      return;
    }
    options_ = options;

    // MEMORY_MODE (dual-bucket) specific initialization.
    if (options_.table_mode == TableMode::kMemory) {
      // Note: dual-bucket mode does not use max_load_factor for rehash
      // triggering.  The effective load factor is governed entirely by the
      // score-based eviction mechanism.  We intentionally leave
      // max_load_factor at its default value and never consult it.
      MERLIN_CHECK(options_.init_capacity == options_.max_capacity,
                   "[MEMORY_MODE] init_capacity must equal max_capacity. "
                   "Auto-rehash is not supported in dual-bucket mode.");
      MERLIN_CHECK(options_.max_hbm_for_vectors == 0,
                   "[MEMORY_MODE] Only pure HBM (fast mode) is supported. "
                   "Set max_hbm_for_vectors = 0.");
      MERLIN_CHECK(
          options_.dim * sizeof(value_type) <= 224 * sizeof(float),
          "[MEMORY_MODE] dim * sizeof(V) must not exceed 896 bytes "
          "(i.e. dim <= 224 for float). The dual-bucket lookup kernel uses a "
          "fixed-size shared memory buffer that cannot accommodate larger "
          "value vectors.");
      // Written as a multiplication (equivalent to
      // `init_capacity / max_bucket_size >= 2` for any non-zero bucket size)
      // so that a zero `max_bucket_size` cannot cause an integer division by
      // zero at this point.
      MERLIN_CHECK(
          options_.init_capacity >= 2 * options_.max_bucket_size,
          "[MEMORY_MODE] capacity must provide at least 2 buckets "
          "(capacity >= 2 * max_bucket_size). Dual-bucket addressing "
          "requires b1 != b2, which is impossible with a single bucket.");
    }

    // Remaining option checks that depend on `options_` only. They run before
    // the default allocator is created so that a rejected configuration does
    // not leave an allocator behind.
    MERLIN_CHECK(options.reserved_key_start_bit >= 0 &&
                     options.reserved_key_start_bit <= MAX_RESERVED_KEY_BIT,
                 "options.reserved_key_start_bit should >= 0 and <= 62.");
    MERLIN_CHECK(ispow2(static_cast<uint32_t>(options_.max_bucket_size)),
                 "Bucket size should be the pow of 2");
    MERLIN_CHECK(
        ispow2(static_cast<uint32_t>(options_.num_of_buckets_per_alloc)),
        "Then `num_of_buckets_per_alloc` should be the pow of 2");
    MERLIN_CHECK(options_.init_capacity >= options_.num_of_buckets_per_alloc *
                                               options_.max_bucket_size,
                 "Then `num_of_buckets_per_alloc` must be equal or less than "
                 "initial required buckets number");
    MERLIN_CHECK(
        (((options_.max_bucket_size * (sizeof(key_type) + sizeof(score_type))) %
          128) == 0),
        "Storage size of keys and scores in one bucket should be the mutiple "
        "of cache line size");

    // Select the target device before issuing any other CUDA call, so that
    // everything below runs against the configured device. A negative
    // `device_id` means "use the current device", which is then recorded.
    if (options_.device_id >= 0) {
      CUDA_CHECK(cudaSetDevice(options_.device_id));
    } else {
      CUDA_CHECK(cudaGetDevice(&(options_.device_id)));
    }

    CUDA_CHECK(init_reserved_keys(options.reserved_key_start_bit));

    default_allocator_ = (allocator == nullptr);
    allocator_ = (allocator == nullptr) ? (new DefaultAllocator()) : allocator;

    thrust_allocator_.set_allocator(allocator_);

    options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size);

    // Construct table.
    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, options_.device_id));
    shared_mem_size_ = deviceProp.sharedMemPerBlock;
    sm_cnt_ = deviceProp.multiProcessorCount;
    max_threads_per_block_ = deviceProp.maxThreadsPerBlock;
    const bool is_memory_mode = (options_.table_mode == TableMode::kMemory);
    create_table<key_type, value_type, score_type>(
        &table_, allocator_, options_.dim, options_.init_capacity,
        options_.max_capacity, options_.max_hbm_for_vectors,
        options_.max_bucket_size, options_.num_of_buckets_per_alloc,
        /*tile_size=*/32, /*primary=*/true,
        /*dual_bucket_mode=*/is_memory_mode);
    // NOTE(review): block_size was already normalized above; this second call
    // looks redundant -- confirm before removing it.
    options_.block_size = SAFE_GET_BLOCK_SIZE(options_.block_size);
    reach_max_capacity_ = (options_.init_capacity * 2 > options_.max_capacity);

    // MEMORY_MODE: force disable auto-rehash.
    if (is_memory_mode) {
      reach_max_capacity_ = true;  // Disable auto-rehash.
    }

    MERLIN_CHECK((!(options_.io_by_cpu && options_.max_hbm_for_vectors != 0)),
                 "[HierarchicalKV] `io_by_cpu` should not be true when "
                 "`max_hbm_for_vectors` is not 0!");
    allocator_->alloc(MemoryType::Device, (void**)&(d_table_),
                      sizeof(TableCore));

    sync_table_configuration();

    // Create memory pools.
    dev_mem_pool_ = std::make_unique<MemoryPool<DeviceAllocator<char>>>(
        options_.device_memory_pool, allocator_);
    host_mem_pool_ = std::make_unique<MemoryPool<HostAllocator<char>>>(
        options_.host_memory_pool, allocator_);

    CUDA_CHECK(cudaDeviceSynchronize());

    initialized_ = true;
    CudaCheckError();
  }

  /**
   * @brief Insert new key-value-score tuples into the hash table.
   * If the key already exists, the values and scores are assigned new values.
   *
   * If the target bucket is full, the keys with minimum score will be
   * overwritten by new key unless the score of the new key is even less than
   * minimum score of the target bucket.
   *
   * @param n Number of key-value-score tuples to insert or assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param values The values to insert on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to insert on GPU-accessible memory with shape
   * (n).
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   * @param ignore_evict_strategy A boolean option indicating whether if
   * the insert_or_assign ignores the evict strategy of table with current
   * scores anyway. If true, it does not check whether the scores conforms to
   * the evict strategy. If false, it requires the scores follow the evict
   * strategy of table.
   */
  void insert_or_assign(const size_type n,
                        const key_type* keys,                // (n)
                        const value_type* values,            // (n, DIM)
                        const score_type* scores = nullptr,  // (n)
                        cudaStream_t stream = 0, bool unique_key = true,
                        bool ignore_evict_strategy = false) {
    // Regular path: run the implementation instantiated for the table's own
    // eviction strategy.
    if (!ignore_evict_strategy) {
      insert_or_assign_impl<evict_strategy>(n, keys, values, scores, stream,
                                            unique_key,
                                            /*ignore_evict_strategy=*/false);
      return;
    }

    // Bypass path: the table's strategy is ignored, so the call is routed to
    // the kCustomized instantiation of the implementation.
    insert_or_assign_impl<EvictStrategy::kCustomized>(
        n, keys, values, scores, stream, unique_key,
        /*ignore_evict_strategy=*/true);
  }

  template <int evict_strategy_>
  void insert_or_assign_impl(const size_type n,
                             const key_type* keys,      // (n)
                             const value_type* values,  // (n, DIM)
                             const score_type* scores,  // (n)
                             cudaStream_t stream, bool unique_key,
                             bool ignore_evict_strategy) {
    if (n == 0) {
      return;
    }

    while (!reach_max_capacity_ &&
           fast_load_factor(n, stream) > options_.max_load_factor) {
      reserve(capacity() * 2, stream);
    }

    if (!ignore_evict_strategy) {
      check_evict_strategy(scores);
    }

    std::unique_ptr<insert_unique_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);
    }

    // MEMORY_MODE: dual-bucket upsert.
    if (is_memory_mode()) {
      MERLIN_CHECK(unique_key,
                   "[MEMORY_MODE] insert_or_assign requires unique_key=true "
                   "in dual-bucket mode.");

      using DualSelector =
          KernelSelector_DualBucketUpsert<key_type, value_type, score_type,
                                          evict_strategy_, ArchTag>;
      typename DualSelector::Params kernelParams(
          /*load_factor=*/0.0f, table_->buckets, table_->buckets_size,
          table_->buckets_num, static_cast<uint32_t>(options_.max_bucket_size),
          static_cast<uint32_t>(options_.dim), keys, values, scores, n,
          global_epoch_);
      DualSelector::select_kernel(kernelParams, stream);
      CudaCheckError();
      return;
    }

    if (is_fast_mode()) {
      static thread_local int step_counter = 0;
      static thread_local float load_factor = 0.0;

      if (((step_counter++) % kernel_select_interval_) == 0) {
        load_factor = fast_load_factor(0, stream, false);
      }

      using Selector = KernelSelector_Upsert<key_type, value_type, score_type,
                                             evict_strategy_, ArchTag>;
      if (Selector::callable(unique_key,
                             static_cast<uint32_t>(options_.max_bucket_size),
                             static_cast<uint32_t>(options_.dim))) {
        typename Selector::Params kernelParams(
            load_factor, table_->buckets, table_->buckets_size,
            table_->buckets_num,
            static_cast<uint32_t>(options_.max_bucket_size),
            static_cast<uint32_t>(options_.dim), keys, values, scores, n,
            global_epoch_);
        Selector::select_kernel(kernelParams, stream);
      } else {
        using Selector = SelectUpsertKernelWithIO<key_type, value_type,
                                                  score_type, evict_strategy_>;
        Selector::execute_kernel(
            load_factor, options_.block_size, options_.max_bucket_size,
            table_->buckets_num, options_.dim, stream, n, d_table_,
            table_->buckets, keys, reinterpret_cast<const value_type*>(values),
            scores, global_epoch_);
      }
    } else {
      auto sortOp = SortPairOp<uintptr_t, int>();
      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);

      MultiVector<value_type*, int, value_type*, int, key_type*, uint8_t> mv(
          n, n, n, n, n, d_sort_bytes);
      const size_type dev_ws_size = mv.total_size();
      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
      auto temp_storage = dev_ws.get<uint8_t*>(0);
      auto d_dst = get_vector<0>(mv, temp_storage);
      auto d_src_offset = get_vector<1>(mv, temp_storage);
      auto d_dst_sorted = get_vector<2>(mv, temp_storage);
      auto d_src_offset_sorted = get_vector<3>(mv, temp_storage);
      auto keys_ptr = get_vector<4>(mv, temp_storage);
      auto d_sort_storage = get_vector<5>(mv, temp_storage);
      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));

      CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream));

      constexpr uint32_t MinBucketCapacityFilter =
          sizeof(VecD_Load) / sizeof(D);

      bool filter_condition =
          unique_key && options_.max_bucket_size >= MinBucketCapacityFilter &&
          !options_.io_by_cpu;

      if (filter_condition) {
        constexpr uint32_t BLOCK_SIZE = 128;

        upsert_kernel_lock_key_hybrid<key_type, value_type, score_type,
                                      BLOCK_SIZE, evict_strategy_>
            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                table_->buckets, table_->buckets_size, table_->buckets_num,
                options_.max_bucket_size, options_.dim, keys, d_dst, scores,
                keys_ptr, d_src_offset, n, global_epoch_);

      } else {
        const size_t block_size = options_.block_size;
        const size_t N = n * TILE_SIZE;
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        upsert_kernel<key_type, value_type, score_type, evict_strategy_,
                      TILE_SIZE><<<grid_size, block_size, 0, stream>>>(
            d_table_, table_->buckets, options_.max_bucket_size,
            table_->buckets_num, options_.dim, keys, d_dst, scores,
            d_src_offset, global_epoch_, N);
      }

      sortOp.sort(n, reinterpret_cast<uintptr_t*>(d_dst),
                  reinterpret_cast<uintptr_t*>(d_dst_sorted), d_src_offset,
                  d_src_offset_sorted, stream);

      if (filter_condition) {
        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        write_kernel_unlock_key<key_type, value_type, score_type>
            <<<grid_size, block_size, 0, stream>>>(values, d_dst_sorted,
                                                   d_src_offset_sorted, dim(),
                                                   keys, keys_ptr, N);

      } else if (options_.io_by_cpu) {
        MultiVector<value_type*, int, value_type> mv1(n, n, n * dim());
        const size_type host_ws_size = mv1.total_size();
        auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};
        auto host_temp_storage = host_ws.get<uint8_t*>(0);
        auto h_dst_sorted = get_vector<0>(mv1, host_temp_storage);
        auto h_src_offset_sorted = get_vector<1>(mv1, host_temp_storage);
        auto h_values = get_vector<2>(mv1, host_temp_storage);

        CUDA_CHECK(cudaMemcpyAsync(h_dst_sorted, d_dst_sorted, mv1.offset(2),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaMemcpyAsync(h_values, values,
                                   n * dim() * sizeof(value_type),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        write_by_cpu<value_type>(h_dst_sorted, h_values, h_src_offset_sorted,
                                 dim(), n);
      } else {
        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        write_kernel<key_type, value_type, score_type>
            <<<grid_size, block_size, 0, stream>>>(
                values, d_dst_sorted, d_src_offset_sorted, dim(), N);
      }
    }

    CudaCheckError();
  }

  /**
   * @brief Insert new key-value-score tuples into the hash table.
   * If the key already exists, the values and scores are assigned new values.
   *
   * If the target bucket is full, the keys with minimum score will be
   * overwritten by new key unless the score of the new key is even less than
   * minimum score of the target bucket. The overwritten key with minimum
   * score will be evicted, with its values and score, to evicted_keys,
   * evicted_values, and evicted_scores separately in compact format.
   *
   * @param n Number of key-value-score tuples to insert or assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param values The values to insert on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to insert on GPU-accessible memory with shape
   * (n).
   * @param evicted_keys The output of keys replaced with minimum score.
   * @param evicted_values The output of values replaced with minimum score on
   * keys.
   * @param evicted_scores The output of scores replaced with minimum score on
   * keys.
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param d_evicted_counter The number of elements evicted on GPU-accessible
   * memory. @notice The caller should guarantee it is set to `0` before
   * calling.
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   * @param ignore_evict_strategy A boolean option indicating whether
   * insert_and_evict ignores the evict strategy of the table with the current
   * scores. If true, it does not check whether the scores conform to
   * the evict strategy. If false, it requires the scores to follow the evict
   * strategy of the table.
   */
  void insert_and_evict(const size_type n,
                        const key_type* keys,          // (n)
                        const value_type* values,      // (n, DIM)
                        const score_type* scores,      // (n)
                        key_type* evicted_keys,        // (n)
                        value_type* evicted_values,    // (n, DIM)
                        score_type* evicted_scores,    // (n)
                        size_type* d_evicted_counter,  // (1)
                        cudaStream_t stream = 0, bool unique_key = true,
                        bool ignore_evict_strategy = false) {
    MERLIN_CHECK(
        !is_memory_mode(),
        "[MEMORY_MODE] insert_and_evict() is not supported in dual-bucket "
        "mode. Use insert_or_assign() instead.");
    if (n == 0) {
      return;
    }

    // Keep doubling the capacity while the load factor projected for `n` more
    // keys exceeds the configured maximum and the table can still grow.
    while (!reach_max_capacity_ &&
           fast_load_factor(n, stream) > options_.max_load_factor) {
      reserve(capacity() * 2, stream);
    }

    if (!ignore_evict_strategy) {
      check_evict_strategy(scores);
    }

    // Held until the function returns (or throws) when API locking is enabled.
    std::unique_ptr<insert_unique_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);
    }

    // TODO: Currently only need eviction when using HashTable as HBM cache.
    if (!is_fast_mode()) {
      throw std::runtime_error("Only allow insert_and_evict in pure HBM mode.");
    }

    // The load factor is only re-sampled every `kernel_select_interval_` calls
    // made from the current thread; in between, the cached value is reused.
    static thread_local int step_counter = 0;
    static thread_local float load_factor = 0.0f;

    if (((step_counter++) % kernel_select_interval_) == 0) {
      load_factor = fast_load_factor(0, stream, false);
    }

    using Selector =
        KernelSelector_UpsertAndEvict<key_type, value_type, score_type,
                                      evict_strategy, ArchTag>;
    if (Selector::callable(unique_key,
                           static_cast<uint32_t>(options_.max_bucket_size),
                           static_cast<uint32_t>(options_.dim))) {
      typename Selector::Params kernelParams(
          load_factor, table_->buckets, table_->buckets_size,
          table_->buckets_num, static_cast<uint32_t>(options_.max_bucket_size),
          static_cast<uint32_t>(options_.dim), keys, values, scores,
          evicted_keys, evicted_values, evicted_scores, n, d_evicted_counter,
          global_epoch_);
      Selector::select_kernel(kernelParams, stream);
    } else if (unique_key and options_.max_bucket_size % 16 == 0) {
      using KernelLauncher =
          InsertAndEvictKernelLauncher<key_type, value_type, score_type,
                                       evict_strategy>;
      typename KernelLauncher::Params kernelParams(
          load_factor, table_->buckets, table_->buckets_size,
          table_->buckets_num, static_cast<uint32_t>(options_.max_bucket_size),
          static_cast<uint32_t>(options_.dim), keys, values, scores,
          evicted_keys, evicted_values, evicted_scores, n, d_evicted_counter,
          global_epoch_);
      KernelLauncher::launch_kernel(kernelParams, stream);
    } else {
      // Generic fallback: evicted tuples are first written sparsely (one slot
      // per input key) into a temporary workspace and then compacted into the
      // caller's output buffers.
      // always use max tile to avoid data-deps as possible.
      const int TILE_SIZE = 32;
      size_t n_offsets = (n + TILE_SIZE - 1) / TILE_SIZE;

      // Workspace layout, packed back to back:
      //   keys[n] | scores[n] | offsets[n_offsets] | values[n * dim] | masks[n]
      // NOTE(review): `d_offsets` is only 8-byte aligned when
      // n * (sizeof(key_type) + sizeof(score_type)) is a multiple of 8 -
      // confirm this holds for every supported key/score type.
      const size_type dev_ws_size =
          n * (sizeof(key_type) + sizeof(score_type)) +
          n_offsets * sizeof(int64_t) + n * dim() * sizeof(value_type) +
          n * sizeof(bool);

      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
      auto tmp_evict_keys{dev_ws.get<key_type*>(0)};
      auto tmp_evict_scores = reinterpret_cast<score_type*>(tmp_evict_keys + n);
      auto d_offsets = reinterpret_cast<int64_t*>(tmp_evict_scores + n);
      auto tmp_evict_values =
          reinterpret_cast<value_type*>(d_offsets + n_offsets);
      auto d_masks = reinterpret_cast<bool*>(tmp_evict_values + n * dim());

      CUDA_CHECK(
          cudaMemsetAsync(d_offsets, 0, n_offsets * sizeof(int64_t), stream));
      CUDA_CHECK(cudaMemsetAsync(d_masks, 0, n * sizeof(bool), stream));

      size_type block_size = options_.block_size;
      size_type grid_size = SAFE_GET_GRID_SIZE(n, block_size);
      // Pre-fill the temporary key slots with the empty-key marker so that
      // slots without an eviction can be told apart afterwards.
      CUDA_CHECK(memset64Async(tmp_evict_keys, EMPTY_KEY_CPU, n, stream));
      using Selector =
          SelectUpsertAndEvictKernelWithIO<key_type, value_type, score_type,
                                           evict_strategy>;
      Selector::execute_kernel(
          load_factor, options_.block_size, options_.max_bucket_size,
          table_->buckets_num, options_.dim, stream, n, d_table_,
          table_->buckets, keys, values, scores, tmp_evict_keys,
          tmp_evict_values, tmp_evict_scores, global_epoch_);
      // Presumably flags every slot whose temporary key is no longer the
      // empty marker - verify against the kernel definition.
      keys_not_empty<K>
          <<<grid_size, block_size, 0, stream>>>(tmp_evict_keys, d_masks, n);

      // Presumably produces per-tile counts in `d_offsets` and the total in
      // `d_evicted_counter` - verify against the kernel definition.
      gpu_cell_count<int64_t, TILE_SIZE><<<grid_size, block_size, 0, stream>>>(
          d_masks, d_offsets, n, d_evicted_counter);

      // First call (null storage) only queries the temporary storage size;
      // the second call performs the in-place exclusive scan. Both return a
      // cudaError_t that must not be dropped.
      void* d_temp_storage = nullptr;
      size_t temp_storage_bytes = 0;
      CUDA_CHECK(cub::DeviceScan::ExclusiveSum(d_temp_storage,
                                               temp_storage_bytes, d_offsets,
                                               d_offsets, n_offsets, stream));
      auto dev_ws1{dev_mem_pool_->get_workspace<1>(temp_storage_bytes, stream)};
      d_temp_storage = dev_ws1.get<void*>(0);
      CUDA_CHECK(cub::DeviceScan::ExclusiveSum(d_temp_storage,
                                               temp_storage_bytes, d_offsets,
                                               d_offsets, n_offsets, stream));

      compact_key_value_score_kernel<K, V, S, int64_t, TILE_SIZE>
          <<<grid_size, block_size, 0, stream>>>(
              d_masks, n, d_offsets, tmp_evict_keys, tmp_evict_values,
              tmp_evict_scores, evicted_keys, evicted_values, evicted_scores,
              dim());
    }

    // Report kernel launch failures here, as the other table APIs do, instead
    // of leaving them for an unrelated later CUDA call to trip over.
    CudaCheckError();
  }

  /**
   * @brief Insert new key-value-score tuples into the hash table.
   * If the key already exists, the values and scores are assigned new values.
   *
   * If the target bucket is full, the keys with minimum score will be
   * overwritten by new key unless the score of the new key is even less than
   * minimum score of the target bucket. The overwritten key with minimum
   * score will be evicted, with its values and score, to evicted_keys,
   * evicted_values, and evicted_scores separately in compact format.
   *
   * @param n Number of key-value-score tuples to insert or assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param values The values to insert on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to insert on GPU-accessible memory with shape
   * (n).
   * @param evicted_keys The output of keys replaced with minimum score.
   * @param evicted_values The output of values replaced with minimum score on
   * keys.
   * @param evicted_scores The output of scores replaced with minimum score on
   * keys.
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   * @param ignore_evict_strategy A boolean option indicating whether
   * insert_and_evict ignores the evict strategy of the table with the current
   * scores. If true, it does not check whether the scores conform to
   * the evict strategy. If false, it requires the scores to follow the evict
   * strategy of the table.
   *
   * @return The number of elements evicted.
   */
  size_type insert_and_evict(const size_type n,
                             const key_type* keys,        // (n)
                             const value_type* values,    // (n, DIM)
                             const score_type* scores,    // (n)
                             key_type* evicted_keys,      // (n)
                             value_type* evicted_values,  // (n, DIM)
                             score_type* evicted_scores,  // (n)
                             cudaStream_t stream = 0, bool unique_key = true,
                             bool ignore_evict_strategy = false) {
    // Nothing to insert means nothing can be evicted.
    if (n == 0) {
      return 0;
    }

    // Borrow a single device-side counter from the pool and clear it, as the
    // counter-taking overload expects it to start at zero.
    auto counter_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)};
    size_type* device_count{counter_ws.get<size_type*>(0)};
    CUDA_CHECK(cudaMemsetAsync(device_count, 0, sizeof(size_type), stream));

    // Delegate the actual work to the overload that reports through the
    // device counter.
    insert_and_evict(n, keys, values, scores, evicted_keys, evicted_values,
                     evicted_scores, device_count, stream, unique_key,
                     ignore_evict_strategy);

    // Bring the count back to the host; the stream must be drained before
    // the host variable can be read.
    size_type host_count = 0;
    CUDA_CHECK(cudaMemcpyAsync(&host_count, device_count, sizeof(size_type),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CudaCheckError();

    return host_count;
  }

  /**
   * Searches for each key in @p keys in the hash table.
   * If the key is found and the corresponding value in @p accum_or_assigns is
   * `true`, the @p value_or_deltas is treated as a delta to the old
   * value, and the delta is added to the old value of the key.
   *
   * If the key is not found and the corresponding value in @p accum_or_assigns
   * is `false`, the @p value_or_deltas is treated as a new value and the
   * key-value pair is updated in the table directly.
   *
   * @note When the key is found and the value of @p accum_or_assigns is
   * `false`, or when the key is not found and the value of @p accum_or_assigns
   * is `true`, nothing is changed and this operation is ignored.
   * The algorithm assumes these situations occur while the key was modified or
   * removed by other processes just now.
   *
   * @param n The number of key-value-score tuples to process.
   * @param keys The keys to insert on GPU-accessible memory with shape (n).
   * @param value_or_deltas The values or deltas to insert on GPU-accessible
   * memory with shape (n, DIM).
   * @param accum_or_assigns The operation type with shape (n). A value of
   * `true` indicates to accum and `false` indicates to assign.
   * @param scores The scores to insert on GPU-accessible memory with shape (n).
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @param ignore_evict_strategy A boolean option indicating whether
   * accum_or_assign ignores the evict strategy of the table with the current
   * scores. If true, it does not check whether the scores conform to
   * the evict strategy. If false, it requires the scores to follow the evict
   * strategy of the table.
   */
  void accum_or_assign(const size_type n,
                       const key_type* keys,                // (n)
                       const value_type* value_or_deltas,   // (n, DIM)
                       const bool* accum_or_assigns,        // (n)
                       const score_type* scores = nullptr,  // (n)
                       cudaStream_t stream = 0,
                       bool ignore_evict_strategy = false) {
    MERLIN_CHECK(
        !is_memory_mode(),
        "[MEMORY_MODE] accum_or_assign() is not supported in dual-bucket "
        "mode. Use insert_or_assign() instead.");
    if (n == 0) {
      return;
    }

    // Grow (by doubling) for as long as the projected load factor is above
    // the configured limit and the table has not hit its maximum capacity.
    while (!reach_max_capacity_ &&
           fast_load_factor(n, stream) > options_.max_load_factor) {
      reserve(capacity() * 2, stream);
    }

    if (!ignore_evict_strategy) {
      check_evict_strategy(scores);
    }

    // Optional API-level lock, released automatically on every exit path.
    std::unique_ptr<insert_unique_lock> api_guard;
    if (options_.api_lock) {
      api_guard = std::make_unique<insert_unique_lock>(mutex_, stream);
    }

    if (is_fast_mode()) {
      // Per-thread cache of the load factor, refreshed once every
      // `kernel_select_interval_` invocations.
      static thread_local int calls_on_thread = 0;
      static thread_local float cached_load_factor = 0.0;

      const bool refresh_now =
          ((calls_on_thread++) % kernel_select_interval_) == 0;
      if (refresh_now) {
        cached_load_factor = fast_load_factor(0, stream, false);
      }

      using FusedKernel =
          SelectAccumOrAssignKernelWithIO<key_type, value_type, score_type,
                                          evict_strategy>;
      FusedKernel::execute_kernel(
          cached_load_factor, options_.block_size, options_.max_bucket_size,
          table_->buckets_num, dim(), stream, n, d_table_, keys,
          value_or_deltas, scores, accum_or_assigns, global_epoch_);

      CudaCheckError();
      return;
    }

    // Non-fast path: (1) resolve a destination address per key, (2) sort the
    // (address, input index) pairs by address, (3) write/accumulate values.
    auto pair_sorter = SortPairOp<uintptr_t, int>();
    auto sort_scratch_bytes = pair_sorter.get_storage_bytes(n, stream);

    // One workspace carved into:
    //   addrs[n] | index[n] | addrs_sorted[n] | index_sorted[n] | found[n] |
    //   sort scratch
    MultiVector<value_type*, int, value_type*, int, bool, uint8_t> layout(
        n, n, n, n, n, sort_scratch_bytes);
    const size_type workspace_bytes = layout.total_size();
    auto workspace{dev_mem_pool_->get_workspace<1>(workspace_bytes, stream)};
    auto workspace_base = workspace.get<uint8_t*>(0);

    auto value_addrs = get_vector<0>(layout, workspace_base);
    auto input_index = get_vector<1>(layout, workspace_base);
    auto value_addrs_sorted = get_vector<2>(layout, workspace_base);
    auto input_index_sorted = get_vector<3>(layout, workspace_base);
    auto found_flags = get_vector<4>(layout, workspace_base);
    auto sort_scratch = get_vector<5>(layout, workspace_base);
    pair_sorter.set_storage(reinterpret_cast<void*>(sort_scratch));

    // Clears `workspace_bytes` bytes starting at the first vector.
    CUDA_CHECK(cudaMemsetAsync(value_addrs, 0, workspace_bytes, stream));

    // Step 1: locate/insert keys; TILE_SIZE threads cooperate per key.
    const size_t lookup_block = options_.block_size;
    const size_t lookup_threads = n * TILE_SIZE;
    const size_t lookup_grid = SAFE_GET_GRID_SIZE(lookup_threads, lookup_block);

    accum_or_assign_kernel<key_type, value_type, score_type, evict_strategy,
                           TILE_SIZE><<<lookup_grid, lookup_block, 0, stream>>>(
        d_table_, options_.max_bucket_size, table_->buckets_num, dim(), keys,
        value_addrs, scores, accum_or_assigns, input_index, found_flags,
        global_epoch_, lookup_threads);

    // Step 2: order the pairs by destination address.
    pair_sorter.sort(n, reinterpret_cast<uintptr_t*>(value_addrs),
                     reinterpret_cast<uintptr_t*>(value_addrs_sorted),
                     input_index, input_index_sorted, stream);

    // Step 3: one thread per value element.
    const size_t io_block = options_.io_block_size;
    const size_t io_threads = n * dim();
    const size_t io_grid = SAFE_GET_GRID_SIZE(io_threads, io_block);

    write_with_accum_kernel<key_type, value_type, score_type>
        <<<io_grid, io_block, 0, stream>>>(value_or_deltas, value_addrs_sorted,
                                           accum_or_assigns, found_flags,
                                           input_index_sorted, dim(),
                                           io_threads);

    CudaCheckError();
  }

  /**
   * @brief Searches the hash table for the specified keys.
   * When a key is missing, the value in @p values and @p scores will be
   * inserted.
   *
   * @param n The number of key-value-score tuples to search or insert.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The values to search on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
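   * @param unique_key If all keys in the same batch are unique.
   * @param ignore_evict_strategy If true, the scores are not checked against
   * the evict strategy of the table before the operation.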
   *
   */
  void find_or_insert(const size_type n, const key_type* keys,  // (n)
                      value_type* values,                       // (n * DIM)
                      score_type* scores = nullptr,             // (n)
                      cudaStream_t stream = 0, bool unique_key = true,
                      bool ignore_evict_strategy = false) {
    // Value-copy overload: `values` is both the source for keys that get
    // inserted and the destination for keys that are found.
    MERLIN_CHECK(
        !is_memory_mode(),
        "[MEMORY_MODE] find_or_insert() is not supported in dual-bucket mode. "
        "Use insert_or_assign() and find() separately.");
    if (n == 0) {
      return;
    }

    // Double the capacity while the load factor projected for `n` more keys
    // exceeds the configured maximum and the table can still grow.
    while (!reach_max_capacity_ &&
           fast_load_factor(n, stream) > options_.max_load_factor) {
      reserve(capacity() * 2, stream);
    }

    if (!ignore_evict_strategy) {
      check_evict_strategy(scores);
    }

    // Optional API-level lock; released when `lock_ptr` goes out of scope.
    std::unique_ptr<insert_unique_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);
    }

    if (is_fast_mode()) {
      // Per-thread cached load factor, re-sampled only once every
      // `kernel_select_interval_` calls.
      static thread_local int step_counter = 0;
      static thread_local float load_factor = 0.0;

      if (((step_counter++) % kernel_select_interval_) == 0) {
        load_factor = fast_load_factor(0, stream, false);
      }

      // Prefer the arch-specific selector when it reports the configuration
      // (key uniqueness, bucket size, dim) as supported; otherwise fall back
      // to the generic kernel-with-IO selector.
      using Selector =
          KernelSelector_FindOrInsert<key_type, value_type, score_type,
                                      evict_strategy, ArchTag>;
      if (Selector::callable(unique_key,
                             static_cast<uint32_t>(options_.max_bucket_size),
                             static_cast<uint32_t>(options_.dim))) {
        typename Selector::Params kernelParams(
            load_factor, table_->buckets, table_->buckets_size,
            table_->buckets_num,
            static_cast<uint32_t>(options_.max_bucket_size),
            static_cast<uint32_t>(options_.dim), keys, values, scores, n,
            global_epoch_);
        Selector::select_kernel(kernelParams, stream);
      } else {
        using Selector =
            SelectFindOrInsertKernelWithIO<key_type, value_type, score_type,
                                           evict_strategy>;
        Selector::execute_kernel(
            load_factor, options_.block_size, options_.max_bucket_size,
            table_->buckets_num, options_.dim, stream, n, d_table_,
            table_->buckets, keys, values, scores, global_epoch_);
      }
    } else {
      // Non-fast path, three stages: resolve a table value address per key,
      // sort (address, input index) pairs by address, then move the values.
      auto sortOp = SortPairOp<uintptr_t, int>();
      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);

      // Single device workspace carved into:
      //   addrs[n] | index[n] | addrs_sorted[n] | index_sorted[n] |
      //   founds[n] | keys_ptr[n] | sort scratch[d_sort_bytes]
      MultiVector<value_type*, int, value_type*, int, bool, key_type*, uint8_t>
          mv(n, n, n, n, n, n, d_sort_bytes);
      const size_type dev_ws_size = mv.total_size();
      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
      auto temp_storage = dev_ws.get<uint8_t*>(0);
      auto d_table_value_addrs = get_vector<0>(mv, temp_storage);
      auto param_key_index = get_vector<1>(mv, temp_storage);
      auto d_table_value_addrs_sorted = get_vector<2>(mv, temp_storage);
      auto param_key_index_sorted = get_vector<3>(mv, temp_storage);
      auto founds = get_vector<4>(mv, temp_storage);
      auto keys_ptr = get_vector<5>(mv, temp_storage);
      auto d_sort_storage = get_vector<6>(mv, temp_storage);
      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));

      // Zeroes `dev_ws_size` bytes starting at vector 0, i.e. the whole
      // workspace assuming vector 0 sits at its start - TODO confirm.
      CUDA_CHECK(cudaMemsetAsync(d_table_value_addrs, 0, dev_ws_size, stream));

      constexpr uint32_t MinBucketCapacityFilter =
          sizeof(VecD_Load) / sizeof(D);

      // The key-locking kernels are only used for unique keys, sufficiently
      // large buckets, and when the value IO is not delegated to the CPU.
      bool filter_condition =
          unique_key && options_.max_bucket_size >= MinBucketCapacityFilter &&
          !options_.io_by_cpu;

      if (filter_condition) {
        constexpr uint32_t BLOCK_SIZE = 128;

        // One thread per key (grid covers `n`); also records key slot
        // pointers in `keys_ptr` for the later unlock stage.
        find_or_insert_kernel_lock_key_hybrid<key_type, value_type, score_type,
                                              BLOCK_SIZE, evict_strategy>
            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                table_->buckets, table_->buckets_size, table_->buckets_num,
                options_.max_bucket_size, options_.dim, keys,
                d_table_value_addrs, scores, keys_ptr, param_key_index, founds,
                n, global_epoch_);

      } else {
        // TILE_SIZE threads are launched per key.
        const size_t block_size = options_.block_size;
        const size_t N = n * TILE_SIZE;
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        find_or_insert_kernel<key_type, value_type, score_type, evict_strategy,
                              TILE_SIZE><<<grid_size, block_size, 0, stream>>>(
            d_table_, table_->buckets, options_.max_bucket_size,
            table_->buckets_num, options_.dim, keys, d_table_value_addrs,
            scores, founds, param_key_index, global_epoch_, N);
      }

      // Sort by table value address (addresses reinterpreted as integers),
      // carrying each key's original input index along.
      sortOp.sort(n, reinterpret_cast<uintptr_t*>(d_table_value_addrs),
                  reinterpret_cast<uintptr_t*>(d_table_value_addrs_sorted),
                  param_key_index, param_key_index_sorted, stream);

      if (filter_condition) {
        // One thread per value element; this kernel also receives `keys_ptr`
        // and `keys`, presumably to restore/unlock the locked key slots.
        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        read_or_write_kernel_unlock_key<key_type, value_type, score_type, V>
            <<<grid_size, block_size, 0, stream>>>(
                d_table_value_addrs_sorted, values, founds,
                param_key_index_sorted, keys_ptr, keys, dim(), N);

      } else if (options_.io_by_cpu) {
        // CPU IO path: stage the sorted addresses, indices, found flags and
        // the caller's values in host memory, move the data on the host, then
        // copy the (possibly updated) values back to the device.
        MultiVector<value_type*, int, bool, value_type> mv1(n, n, n, n * dim());
        const size_type host_ws_size = mv1.total_size();
        auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};
        auto host_temp_storage = host_ws.get<uint8_t*>(0);
        auto h_table_value_addrs_sorted = get_vector<0>(mv1, host_temp_storage);
        auto h_param_key_index_sorted = get_vector<1>(mv1, host_temp_storage);
        auto h_founds = get_vector<2>(mv1, host_temp_storage);
        auto h_param_values = get_vector<3>(mv1, host_temp_storage);

        // A single copy of `mv1.offset(3)` bytes fills host vectors 0..2.
        // NOTE(review): this relies on device vectors 2..4 (addrs_sorted,
        // index_sorted, founds) being laid out exactly like host vectors
        // 0..2 - confirm against MultiVector's offset/alignment rules.
        CUDA_CHECK(cudaMemcpyAsync(h_table_value_addrs_sorted,
                                   d_table_value_addrs_sorted, mv1.offset(3),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaMemcpyAsync(h_param_values, values,
                                   n * sizeof(value_type) * dim(),
                                   cudaMemcpyDeviceToHost, stream));
        // The host must not touch the staged buffers until both copies are
        // complete.
        CUDA_CHECK(cudaStreamSynchronize(stream));

        read_or_write_by_cpu<value_type>(
            h_table_value_addrs_sorted, h_param_values,
            h_param_key_index_sorted, h_founds, dim(), n);
        // Publish the host-side result back into the caller's device buffer.
        CUDA_CHECK(cudaMemcpyAsync(values, h_param_values,
                                   n * sizeof(value_type) * dim(),
                                   cudaMemcpyHostToDevice, stream));
      } else {
        // Plain device IO: one thread per value element.
        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        read_or_write_kernel<key_type, value_type, score_type>
            <<<grid_size, block_size, 0, stream>>>(
                d_table_value_addrs_sorted, values, founds,
                param_key_index_sorted, dim(), N);
      }
    }

    // Surface any pending CUDA error from the launches above.
    CudaCheckError();
  }

  /**
   * @brief Searches the hash table for the specified keys and returns address
   * of the values. When a key is missing, the value in @p values and @p scores
   * will be inserted.
   *
   * @warning This API returns internal addresses for high-performance but
   * thread-unsafe. The caller is responsible for guaranteeing data consistency.
   *
   * @param n The number of key-value-score tuples to search or insert.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values  The addresses of values to search on GPU-accessible memory
   * with shape (n).
   * @param founds The status that indicates if the keys are found, on
   * GPU-accessible memory with shape (n).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
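   * @param unique_key If all keys in the same batch are unique.
   * @param ignore_evict_strategy If true, the scores are not checked against
   * the evict strategy of the table before the operation.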
   * @param locked_key_ptrs If it isn't nullptr then the keys in the table will
   * be locked, and key's address will write to locked_key_ptrs. Using
   * unlock_keys to unlock these keys.
   *
   */
  void find_or_insert(const size_type n, const key_type* keys,  // (n)
                      value_type** values,                      // (n)
                      bool* founds,                             // (n)
                      score_type* scores = nullptr,             // (n)
                      cudaStream_t stream = 0, bool unique_key = true,
                      bool ignore_evict_strategy = false,
                      key_type** locked_key_ptrs = nullptr) {
    MERLIN_CHECK(
        !is_memory_mode(),
        "[MEMORY_MODE] find_or_insert() is not supported in dual-bucket mode. "
        "Use insert_or_assign() and find() separately.");
    if (n == 0) {
      return;
    }

    // Grow (by doubling) while the projected load factor is over the limit
    // and the table is still allowed to expand.
    while (!reach_max_capacity_ &&
           fast_load_factor(n, stream) > options_.max_load_factor) {
      reserve(capacity() * 2, stream);
    }

    if (!ignore_evict_strategy) {
      check_evict_strategy(scores);
    }

    // Optional API-level lock, dropped automatically on return or throw.
    std::unique_ptr<insert_unique_lock> api_guard;
    if (options_.api_lock) {
      api_guard = std::make_unique<insert_unique_lock>(mutex_, stream);
    }

    constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D);
    constexpr uint32_t LOCK_BLOCK_SIZE = 128U;

    // The key-locking kernel needs unique keys and a large enough bucket.
    const bool lock_kernel_usable =
        unique_key && options_.max_bucket_size >= MinBucketCapacityFilter;

    // Case 1: the caller wants the keys to stay locked and receives their
    // slot addresses; no unlock kernel is launched here.
    if (locked_key_ptrs != nullptr) {
      if (!lock_kernel_usable) {
        throw std::invalid_argument(
            "unique_key should be true and max_bucket_size should be larger.");
      }

      find_or_insert_ptr_kernel_lock_key<key_type, value_type, score_type,
                                         LOCK_BLOCK_SIZE, evict_strategy>
          <<<(n + LOCK_BLOCK_SIZE - 1) / LOCK_BLOCK_SIZE, LOCK_BLOCK_SIZE, 0,
             stream>>>(table_->buckets, table_->buckets_size,
                       table_->buckets_num, options_.max_bucket_size,
                       options_.dim, keys, values, scores, locked_key_ptrs, n,
                       founds, global_epoch_);
      CudaCheckError();
      return;
    }

    if (lock_kernel_usable) {
      // Case 2: same locking kernel, but the slot addresses go into a
      // zero-initialized scratch buffer and are unlocked right afterwards.
      const size_type slot_bytes{n * sizeof(key_type**)};
      auto slot_ws{dev_mem_pool_->get_workspace<1>(slot_bytes, stream)};
      auto slot_ptrs{slot_ws.get<key_type**>(0)};
      CUDA_CHECK(cudaMemsetAsync(slot_ptrs, 0, slot_bytes, stream));

      find_or_insert_ptr_kernel_lock_key<key_type, value_type, score_type,
                                         LOCK_BLOCK_SIZE, evict_strategy>
          <<<(n + LOCK_BLOCK_SIZE - 1) / LOCK_BLOCK_SIZE, LOCK_BLOCK_SIZE, 0,
             stream>>>(table_->buckets, table_->buckets_size,
                       table_->buckets_num, options_.max_bucket_size,
                       options_.dim, keys, values, scores, slot_ptrs, n,
                       founds, global_epoch_);

      find_or_insert_ptr_kernel_unlock_key<key_type>
          <<<(n + LOCK_BLOCK_SIZE - 1) / LOCK_BLOCK_SIZE, LOCK_BLOCK_SIZE, 0,
             stream>>>(keys, slot_ptrs, n);
    } else {
      // Case 3: generic kernel chosen by load factor; the load factor is
      // cached per thread and refreshed every `kernel_select_interval_`
      // calls.
      static thread_local int calls_on_thread = 0;
      static thread_local float cached_load_factor = 0.0;

      const bool refresh_now =
          ((calls_on_thread++) % kernel_select_interval_) == 0;
      if (refresh_now) {
        cached_load_factor = fast_load_factor(0, stream, false);
      }

      using PtrKernel = SelectFindOrInsertPtrKernel<key_type, value_type,
                                                    score_type, evict_strategy>;
      PtrKernel::execute_kernel(
          cached_load_factor, options_.block_size, options_.max_bucket_size,
          table_->buckets_num, options_.dim, stream, n, d_table_,
          table_->buckets, keys, values, scores, founds, global_epoch_);
    }

    CudaCheckError();
  }

  /**
   * @brief
   * This function will lock the keys in the table; keys that do not exist
   * will be ignored.
   *
   * @param n The number of keys in the table to be locked.
   * @param locked_key_ptrs The pointers of locked keys in the table with shape
   * (n).
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param success The status that indicates if the lock operation is
   * succeed.
   * @param stream The CUDA stream that is used to execute the operation.
   * @param scores The scores of the input keys will set to scores if provided.
   *
   */
  void lock_keys(const size_type n,
                 key_type const* keys,        // (n)
                 key_type** locked_key_ptrs,  // (n)
                 bool* success = nullptr,     // (n)
                 cudaStream_t stream = 0, score_type const* scores = nullptr) {
    if (n == 0) {
      return;
    }

    std::unique_ptr<insert_unique_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);
    }

    constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D);
    if (options_.max_bucket_size < MinBucketCapacityFilter) {
      throw std::runtime_error(
          "Not support lock_keys API because the bucket capacity is too "
          "small.");
    }
    constexpr uint32_t BLOCK_SIZE = 128U;
    lock_kernel_with_filter<key_type, value_type, score_type, evict_strategy>
        <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
            table_->buckets, table_->buckets_num, options_.max_bucket_size,
            options_.dim, keys, locked_key_ptrs, success, scores, global_epoch_,
            n);
    CudaCheckError();
  }

  /**
   * @brief Using pointers to address the keys in the hash table and set them
   * to target keys.
   * This function will unlock the keys in the table which are locked by
   * the previous call to find_or_insert.
   *
   * @param n The number of keys in the table to be unlocked.
   * @param locked_key_ptrs The pointers of locked keys in the table with shape
   * (n).
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param success The status that indicates whether the unlock operation
   * succeeded.
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  void unlock_keys(const size_type n, key_type** locked_key_ptrs,  // (n)
                   const key_type* keys,                           // (n)
                   bool* success = nullptr,                        // (n)
                   cudaStream_t stream = 0) {
    if (n == 0) {
      return;
    }

    std::unique_ptr<insert_unique_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<insert_unique_lock>(mutex_, stream);
    }

    constexpr uint32_t BLOCK_SIZE = 128U;
    /// TODO: check the key belongs to the bucket.
    unlock_keys_kernel<key_type>
        <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
            n, locked_key_ptrs, keys, success);
  }

  /**
   * @brief Assign new key-value-score tuples into the hash table.
   * If the key doesn't exist, the operation on the key will be ignored.
   *
   * @param n Number of key-value-score tuples to insert or assign.
   * @param keys The keys to insert on GPU-accessible memory with shape
   * (n).
   * @param values The values to insert on GPU-accessible memory with
   * shape (n, DIM).
   * @param scores The scores to insert on GPU-accessible memory with shape
   * (n).
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @param unique_key If all keys in the same batch are unique.
   */
  void assign(const size_type n,
              const key_type* keys,                // (n)
              const value_type* values,            // (n, DIM)
              const score_type* scores = nullptr,  // (n)
              cudaStream_t stream = 0, bool unique_key = true) {
    if (n == 0) {
      return;
    }

    check_evict_strategy(scores);

    std::unique_ptr<update_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_shared_lock>(mutex_, stream);
    }

    if (is_fast_mode()) {
      static thread_local int step_counter = 0;
      static thread_local float load_factor = 0.0;

      if (((step_counter++) % kernel_select_interval_) == 0) {
        load_factor = fast_load_factor(0, stream, false);
      }
      using Selector = KernelSelector_Update<key_type, value_type, score_type,
                                             evict_strategy, ArchTag>;
      if (Selector::callable(unique_key,
                             static_cast<uint32_t>(options_.max_bucket_size),
                             static_cast<uint32_t>(options_.dim))) {
        typename Selector::Params kernelParams(
            load_factor, table_->buckets, table_->buckets_num,
            static_cast<uint32_t>(options_.max_bucket_size),
            static_cast<uint32_t>(options_.dim), keys, values, scores, n,
            global_epoch_);
        Selector::select_kernel(kernelParams, stream);
      } else {
        using Selector = SelectUpdateKernelWithIO<key_type, value_type,
                                                  score_type, evict_strategy>;
        Selector::execute_kernel(
            load_factor, options_.block_size, options_.max_bucket_size,
            table_->buckets_num, options_.dim, stream, n, d_table_,
            table_->buckets, keys, values, scores, global_epoch_);
      }
    } else {
      auto sortOp = SortPairOp<uintptr_t, int>();
      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);

      MultiVector<value_type*, int, value_type*, int, key_type*, uint8_t> mv(
          n, n, n, n, n, d_sort_bytes);
      const size_type dev_ws_size = mv.total_size();
      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
      auto temp_storage = dev_ws.get<uint8_t*>(0);
      auto d_dst = get_vector<0>(mv, temp_storage);
      auto d_src_offset = get_vector<1>(mv, temp_storage);
      auto d_dst_sorted = get_vector<2>(mv, temp_storage);
      auto d_src_offset_sorted = get_vector<3>(mv, temp_storage);
      auto keys_ptr = get_vector<4>(mv, temp_storage);
      auto d_sort_storage = get_vector<5>(mv, temp_storage);
      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));

      CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream));

      constexpr uint32_t MinBucketCapacityFilter =
          sizeof(VecD_Load) / sizeof(D);

      bool filter_condition =
          options_.max_bucket_size >= MinBucketCapacityFilter &&
          !options_.io_by_cpu && unique_key;

      if (filter_condition) {
        constexpr uint32_t BLOCK_SIZE = 128U;

        tlp_update_kernel_hybrid<key_type, value_type, score_type,
                                 evict_strategy>
            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                table_->buckets, table_->buckets_num, options_.max_bucket_size,
                options_.dim, keys, d_dst, scores, keys_ptr, d_src_offset,
                global_epoch_, n);

      } else {
        const size_t block_size = options_.block_size;
        const size_t N = n * TILE_SIZE;
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        update_kernel<key_type, value_type, score_type, evict_strategy,
                      TILE_SIZE><<<grid_size, block_size, 0, stream>>>(
            d_table_, table_->buckets, options_.max_bucket_size,
            table_->buckets_num, options_.dim, keys, d_dst, scores,
            d_src_offset, global_epoch_, N);
      }

      sortOp.sort(n, reinterpret_cast<uintptr_t*>(d_dst),
                  reinterpret_cast<uintptr_t*>(d_dst_sorted), d_src_offset,
                  d_src_offset_sorted, stream);

      if (filter_condition) {
        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        write_kernel_unlock_key<key_type, value_type, score_type>
            <<<grid_size, block_size, 0, stream>>>(values, d_dst_sorted,
                                                   d_src_offset_sorted, dim(),
                                                   keys, keys_ptr, N);

      } else if (options_.io_by_cpu) {
        MultiVector<value_type*, int, value_type> mv1(n, n, n * dim());
        const size_type host_ws_size = mv1.total_size();
        auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};
        auto host_temp_storage = host_ws.get<uint8_t*>(0);
        auto h_dst_sorted = get_vector<0>(mv1, host_temp_storage);
        auto h_src_offset_sorted = get_vector<1>(mv1, host_temp_storage);
        auto h_values = get_vector<2>(mv1, host_temp_storage);

        CUDA_CHECK(cudaMemcpyAsync(h_dst_sorted, d_dst_sorted, mv1.offset(2),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaMemcpyAsync(h_values, values,
                                   n * dim() * sizeof(value_type),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        write_by_cpu<value_type>(h_dst_sorted, h_values, h_src_offset_sorted,
                                 dim(), n);
      } else {
        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        write_kernel<key_type, value_type, score_type>
            <<<grid_size, block_size, 0, stream>>>(
                values, d_dst_sorted, d_src_offset_sorted, dim(), N);
      }
    }

    CudaCheckError();
  }

  /**
   * @brief Assign new scores for keys.
   * If the key doesn't exist, the operation on the key will be ignored.
   *
   * @param n Number of key-score pairs to assign.
   * @param keys The keys to update on GPU-accessible memory with shape
   * (n).
   * @param scores The scores to assign on GPU-accessible memory with shape
   * (n).
   * @parblock
   * The scores should be a `uint64_t` value for built-in strategies. For
   * `EvictStrategy::kCustomized`, `uint32_t` scores are also supported.
   * You can specify a value such as the timestamp of the key insertion or
   * number of key occurrences to perform a customized eviction strategy.
   *
   * The @p scores should be `nullptr`, when the LRU eviction strategy is
   * applied.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @param unique_key If all keys in the same batch are unique.
   */
  void assign_scores(const size_type n,
                     const key_type* keys,                // (n)
                     const score_type* scores = nullptr,  // (n)
                     cudaStream_t stream = 0, bool unique_key = true) {
    MERLIN_CHECK(
        !is_memory_mode(),
        "[MEMORY_MODE] assign_scores() is not supported in dual-bucket mode. "
        "Scores are managed by insert_or_assign() in MEMORY_MODE.");
    if (n == 0) {
      return;
    }

    check_evict_strategy(scores);

    {
      std::unique_ptr<update_shared_lock> lock_ptr;
      if (options_.api_lock) {
        lock_ptr = std::make_unique<update_shared_lock>(mutex_, stream);
      }
      static thread_local int step_counter = 0;
      static thread_local float load_factor = 0.0;

      if (((step_counter++) % kernel_select_interval_) == 0) {
        load_factor = fast_load_factor(0, stream, false);
      }
      using Selector = KernelSelector_UpdateScore<key_type, value_type,
                                                  score_type, evict_strategy>;
      if (Selector::callable(unique_key,
                             static_cast<uint32_t>(options_.max_bucket_size))) {
        typename Selector::Params kernelParams(
            load_factor, table_->buckets, table_->buckets_num,
            static_cast<uint32_t>(options_.max_bucket_size), keys, scores, n,
            global_epoch_);
        Selector::select_kernel(kernelParams, stream);
      } else {
        using Selector = SelectUpdateScoreKernel<key_type, value_type,
                                                 score_type, evict_strategy>;
        Selector::execute_kernel(load_factor, options_.block_size,
                                 options_.max_bucket_size, table_->buckets_num,
                                 stream, n, d_table_, table_->buckets, keys,
                                 scores, global_epoch_);
      }
    }

    CudaCheckError();
  }

  /**
   * @brief Alias of `assign_scores`.
   */
  void assign(const size_type n,
              const key_type* keys,                // (n)
              const score_type* scores = nullptr,  // (n)
              cudaStream_t stream = 0, bool unique_key = true) {
    assign_scores(n, keys, scores, stream, unique_key);
  }

  /**
   * @brief Assign new values for each key.
   * If the key doesn't exist, the operation on the key will be ignored.
   *
   * @param n Number of key-value pairs to assign.
   * @param keys The keys need to be operated, which must be on GPU-accessible
   * memory with shape (n).
   * @param values The values need to be updated, which must be on
   * GPU-accessible memory with shape (n, DIM).
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @param unique_key If all keys in the same batch are unique.
   */
  void assign_values(const size_type n,
                     const key_type* keys,      // (n)
                     const value_type* values,  // (n, DIM)
                     cudaStream_t stream = 0, bool unique_key = true) {
    MERLIN_CHECK(
        !is_memory_mode(),
        "[MEMORY_MODE] assign_values() is not supported in dual-bucket mode. "
        "Use insert_or_assign() to update values in MEMORY_MODE.");
    if (n == 0) {
      return;
    }

    std::unique_ptr<update_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_shared_lock>(mutex_, stream);
    }

    if (is_fast_mode()) {
      static thread_local int step_counter = 0;
      static thread_local float load_factor = 0.0;

      if (((step_counter++) % kernel_select_interval_) == 0) {
        load_factor = fast_load_factor(0, stream, false);
      }
      using Selector = KernelSelector_UpdateValues<key_type, value_type,
                                                   score_type, ArchTag>;
      if (Selector::callable(unique_key,
                             static_cast<uint32_t>(options_.max_bucket_size),
                             static_cast<uint32_t>(options_.dim))) {
        typename Selector::Params kernelParams(
            load_factor, table_->buckets, table_->buckets_num,
            static_cast<uint32_t>(options_.max_bucket_size),
            static_cast<uint32_t>(options_.dim), keys, values, n);
        Selector::select_kernel(kernelParams, stream);
      } else {
        using Selector =
            SelectUpdateValuesKernelWithIO<key_type, value_type, score_type>;
        Selector::execute_kernel(load_factor, options_.block_size,
                                 options_.max_bucket_size, table_->buckets_num,
                                 options_.dim, stream, n, d_table_,
                                 table_->buckets, keys, values);
      }
    } else {
      auto sortOp = SortPairOp<uintptr_t, int>();
      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);

      MultiVector<value_type*, int, value_type*, int, key_type*, uint8_t> mv(
          n, n, n, n, n, d_sort_bytes);
      const size_type dev_ws_size = mv.total_size();
      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
      auto temp_storage = dev_ws.get<uint8_t*>(0);
      auto d_dst = get_vector<0>(mv, temp_storage);
      auto d_src_offset = get_vector<1>(mv, temp_storage);
      auto d_dst_sorted = get_vector<2>(mv, temp_storage);
      auto d_src_offset_sorted = get_vector<3>(mv, temp_storage);
      auto keys_ptr = get_vector<4>(mv, temp_storage);
      auto d_sort_storage = get_vector<5>(mv, temp_storage);
      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));

      CUDA_CHECK(cudaMemsetAsync(d_dst, 0, dev_ws_size, stream));

      constexpr uint32_t MinBucketCapacityFilter =
          sizeof(VecD_Load) / sizeof(D);

      bool filter_condition =
          options_.max_bucket_size >= MinBucketCapacityFilter &&
          !options_.io_by_cpu && unique_key;

      if (filter_condition) {
        constexpr uint32_t BLOCK_SIZE = 128U;

        tlp_update_values_kernel_hybrid<key_type, value_type, score_type>
            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                table_->buckets, table_->buckets_num, options_.max_bucket_size,
                options_.dim, keys, d_dst, keys_ptr, d_src_offset, n);

      } else {
        const size_t block_size = options_.block_size;
        const size_t N = n * TILE_SIZE;
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        update_values_kernel<key_type, value_type, score_type, TILE_SIZE>
            <<<grid_size, block_size, 0, stream>>>(
                d_table_, table_->buckets, options_.max_bucket_size,
                table_->buckets_num, options_.dim, keys, d_dst, d_src_offset,
                N);
      }

      sortOp.sort(n, reinterpret_cast<uintptr_t*>(d_dst),
                  reinterpret_cast<uintptr_t*>(d_dst_sorted), d_src_offset,
                  d_src_offset_sorted, stream);

      if (filter_condition) {
        const size_t block_size = options_.io_block_size;
        uint64_t total_value_size = sizeof(value_type) * dim();
        if (total_value_size % 16 == 0) {
          using VecV = byte16;
          uint64_t vec_dim = total_value_size / sizeof(VecV);
          const size_t N = n * vec_dim;
          const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

          write_kernel_unlock_key<key_type, VecV, score_type>
              <<<grid_size, block_size, 0, stream>>>(
                  reinterpret_cast<const VecV*>(values),
                  reinterpret_cast<VecV**>(d_dst_sorted), d_src_offset_sorted,
                  vec_dim, keys, keys_ptr, N);
        } else {
          const size_t N = n * dim();
          const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

          write_kernel_unlock_key<key_type, value_type, score_type>
              <<<grid_size, block_size, 0, stream>>>(values, d_dst_sorted,
                                                     d_src_offset_sorted, dim(),
                                                     keys, keys_ptr, N);
        }
      } else if (options_.io_by_cpu) {
        MultiVector<value_type*, int, value_type> mv1(n, n, n * dim());
        const size_type host_ws_size = mv1.total_size();
        auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};
        auto host_temp_storage = host_ws.get<uint8_t*>(0);
        auto h_dst_sorted = get_vector<0>(mv1, host_temp_storage);
        auto h_src_offset_sorted = get_vector<1>(mv1, host_temp_storage);
        auto h_values = get_vector<2>(mv1, host_temp_storage);

        CUDA_CHECK(cudaMemcpyAsync(h_dst_sorted, d_dst_sorted, mv1.offset(2),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaMemcpyAsync(h_values, values,
                                   n * dim() * sizeof(value_type),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        write_by_cpu<value_type>(h_dst_sorted, h_values, h_src_offset_sorted,
                                 dim(), n);
      } else {
        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        write_kernel<key_type, value_type, score_type>
            <<<grid_size, block_size, 0, stream>>>(
                values, d_dst_sorted, d_src_offset_sorted, dim(), N);
      }
    }

    CudaCheckError();
  }

  /**
   * @brief Searches the hash table for the specified keys.
   *
   * @note When a key is missing, the value in @p values is not changed.
   *
   * @param n The number of key-value-score tuples to search.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The values to search on GPU-accessible memory with
   * shape (n, DIM).
   * @param founds The status that indicates if the keys are found on
   * GPU-accessible memory with shape (n).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  void find(const size_type n, const key_type* keys,  // (n)
            value_type* values,                       // (n, DIM)
            bool* founds,                             // (n)
            score_type* scores = nullptr,             // (n)
            cudaStream_t stream = 0) const {
    if (n == 0) {
      return;
    }

    CUDA_CHECK(cudaMemsetAsync(founds, 0, n * sizeof(bool), stream));

    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }

    const uint32_t value_size = dim() * sizeof(V);

    // MEMORY_MODE: dual-bucket find (sequential b1 then b2).
    if (is_memory_mode()) {
      using DualSelector = SelectDualBucketLookupKernel<key_type, value_type,
                                                        score_type, ArchTag>;
      LookupKernelParams<key_type, value_type, score_type> lookupParams(
          table_->buckets, table_->buckets_num, static_cast<uint32_t>(dim()),
          keys, values, scores, founds, n);
      DualSelector::select_kernel(lookupParams, table_->buckets_size, stream);
      CudaCheckError();
      return;
    }

    if (is_fast_mode()) {
      using Selector = SelectPipelineLookupKernelWithIO<key_type, value_type,
                                                        score_type, ArchTag>;
      const uint32_t pipeline_max_size = Selector::max_value_size();
      // Pipeline lookup kernel only supports "bucket_size = 128".
      if (options_.max_bucket_size == 128 && value_size <= pipeline_max_size) {
        LookupKernelParams<key_type, value_type, score_type> lookupParams(
            table_->buckets, table_->buckets_num, static_cast<uint32_t>(dim()),
            keys, values, scores, founds, n);
        Selector::select_kernel(lookupParams, stream);
      } else {
        using Selector =
            SelectLookupKernelWithIO<key_type, value_type, score_type>;
        static thread_local int step_counter = 0;
        static thread_local float load_factor = 0.0;

        if (((step_counter++) % kernel_select_interval_) == 0) {
          load_factor = fast_load_factor(0, stream, false);
        }
        Selector::execute_kernel(load_factor, options_.block_size,
                                 options_.max_bucket_size, table_->buckets_num,
                                 options_.dim, stream, n, d_table_,
                                 table_->buckets, keys, values, scores, founds);
      }
    } else {
      auto sortOp = SortPairOp<uintptr_t, int>();
      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);

      MultiVector<value_type*, int, value_type*, int, uint8_t> mv(n, n, n, n,
                                                                  d_sort_bytes);
      const size_type dev_ws_size = mv.total_size();
      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
      auto temp_storage = dev_ws.get<uint8_t*>(0);
      auto src = get_vector<0>(mv, temp_storage);
      auto dst_offset = get_vector<1>(mv, temp_storage);
      auto src_sorted = get_vector<2>(mv, temp_storage);
      auto dst_offset_sorted = get_vector<3>(mv, temp_storage);
      auto d_sort_storage = get_vector<4>(mv, temp_storage);
      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));

      CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream));

      constexpr uint32_t MinBucketCapacityFilter =
          sizeof(VecD_Load) / sizeof(D);

      bool filter_condition =
          options_.max_bucket_size >= MinBucketCapacityFilter;

      if (filter_condition) {
        constexpr uint32_t BLOCK_SIZE = 128U;

        tlp_lookup_kernel_hybrid<key_type, value_type, score_type>
            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                table_->buckets, table_->buckets_num, options_.max_bucket_size,
                options_.dim, keys, src, scores, dst_offset, founds, n);
      } else {
        const size_t block_size = options_.block_size;
        const size_t N = n * TILE_SIZE;
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        lookup_kernel<key_type, value_type, score_type, TILE_SIZE>
            <<<grid_size, block_size, 0, stream>>>(
                d_table_, table_->buckets, options_.max_bucket_size,
                table_->buckets_num, options_.dim, keys, src, scores, founds,
                dst_offset, N);
      }

      if (values != nullptr) {
        sortOp.sort(n, reinterpret_cast<uintptr_t*>(src),
                    reinterpret_cast<uintptr_t*>(src_sorted), dst_offset,
                    dst_offset_sorted, stream);

        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        read_kernel<key_type, value_type, score_type>
            <<<grid_size, block_size, 0, stream>>>(src_sorted, values, founds,
                                                   dst_offset_sorted, dim(), N);
      }
    }

    CudaCheckError();
  }

  /**
   * @brief Searches the hash table for the specified keys.
   *
   * @note When the searched keys are not hit, missed keys/indices/size can be
   * obtained.
   *
   * @param n The number of key-value-score tuples to search.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The values to search on GPU-accessible memory with
   * shape (n, DIM).
   * @param missed_keys The missed keys to search on GPU-accessible memory with
   * shape (n).
   * @param missed_indices The missed indices to search on GPU-accessible memory
   * with shape (n).
   * @param missed_size The size of `missed_keys` and `missed_indices`.
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   */
  void find(const size_type n, const key_type* keys,  // (n)
            value_type* values,                       // (n, DIM)
            key_type* missed_keys,                    // (n)
            int* missed_indices,                      // (n)
            int* missed_size,                         // scalar
            score_type* scores = nullptr,             // (n)
            cudaStream_t stream = 0) const {
    if (n == 0) {
      return;
    }

    CUDA_CHECK(cudaMemsetAsync(missed_size, 0, sizeof(*missed_size), stream));

    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }

    const uint32_t value_size = options_.dim * sizeof(V);

    if (is_fast_mode()) {
      using Selector = SelectPipelineLookupKernelWithIO<key_type, value_type,
                                                        score_type, ArchTag>;
      const uint32_t pipeline_max_size = Selector::max_value_size();
      // Pipeline lookup kernel only supports "bucket_size = 128".
      if (options_.max_bucket_size == 128 && value_size <= pipeline_max_size) {
        LookupKernelParamsV2<key_type, value_type, score_type> lookupParams(
            table_->buckets, table_->buckets_num, static_cast<uint32_t>(dim()),
            keys, values, scores, missed_keys, missed_indices, missed_size, n);
        Selector::select_kernel(lookupParams, stream);
      } else {
        using Selector =
            SelectLookupKernelWithIOV2<key_type, value_type, score_type>;
        static thread_local int step_counter = 0;
        static thread_local float load_factor = 0.0;

        if (((step_counter++) % kernel_select_interval_) == 0) {
          load_factor = fast_load_factor(0, stream, false);
        }
        Selector::execute_kernel(load_factor, options_.block_size,
                                 options_.max_bucket_size, table_->buckets_num,
                                 options_.dim, stream, n, d_table_,
                                 table_->buckets, keys, values, scores,
                                 missed_keys, missed_indices, missed_size);
      }
    } else {
      auto sortOp = SortPairOp<uintptr_t, int>();
      auto d_sort_bytes = sortOp.get_storage_bytes(n, stream);

      MultiVector<value_type*, int, value_type*, int, uint8_t> mv(n, n, n, n,
                                                                  d_sort_bytes);
      const size_type dev_ws_size = mv.total_size();
      auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
      auto temp_storage = dev_ws.get<uint8_t*>(0);
      auto src = get_vector<0>(mv, temp_storage);
      auto dst_offset = get_vector<1>(mv, temp_storage);
      auto src_sorted = get_vector<2>(mv, temp_storage);
      auto dst_offset_sorted = get_vector<3>(mv, temp_storage);
      auto d_sort_storage = get_vector<4>(mv, temp_storage);
      sortOp.set_storage(reinterpret_cast<void*>(d_sort_storage));

      CUDA_CHECK(cudaMemsetAsync(src, 0, dev_ws_size, stream));

      constexpr uint32_t MinBucketCapacityFilter =
          sizeof(VecD_Load) / sizeof(D);

      bool filter_condition =
          options_.max_bucket_size >= MinBucketCapacityFilter;

      if (filter_condition) {
        constexpr uint32_t BLOCK_SIZE = 128U;

        tlp_lookup_kernel_hybrid<key_type, value_type, score_type>
            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                table_->buckets, table_->buckets_num, options_.max_bucket_size,
                options_.dim, keys, src, scores, dst_offset, missed_keys,
                missed_indices, missed_size, n);
      } else {
        const size_t block_size = options_.block_size;
        const size_t N = n * TILE_SIZE;
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        lookup_kernel<key_type, value_type, score_type, TILE_SIZE>
            <<<grid_size, block_size, 0, stream>>>(
                d_table_, table_->buckets, options_.max_bucket_size,
                table_->buckets_num, options_.dim, keys, src, scores,
                missed_keys, missed_indices, missed_size, dst_offset, N);
      }

      if (values != nullptr) {
        sortOp.sort(n, reinterpret_cast<uintptr_t*>(src),
                    reinterpret_cast<uintptr_t*>(src_sorted), dst_offset,
                    dst_offset_sorted, stream);

        const size_t block_size = options_.io_block_size;
        const size_t N = n * dim();
        const size_t grid_size = SAFE_GET_GRID_SIZE(N, block_size);

        read_kernel<key_type, value_type, score_type>
            <<<grid_size, block_size, 0, stream>>>(src_sorted, values,
                                                   dst_offset_sorted, dim(), N);
      }
    }

    CudaCheckError();
  }

  /**
   * @brief Searches the hash table for the specified keys and returns address
   * of the values.
   *
   * @note When a key is missing, the data in @p values won't change.
   * @warning This API returns internal addresses for high-performance but
   * thread-unsafe. The caller is responsible for guaranteeing data consistency.
   *
   * @param n The number of key-value-score tuples to search.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The addresses of values to search on GPU-accessible memory
   * with shape (n).
   * @param founds The status that indicates if the keys are found on
   * GPU-accessible memory with shape (n).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   */
  void find(const size_type n, const key_type* keys,  // (n)
            value_type** values,                      // (n)
            bool* founds,                             // (n)
            score_type* scores = nullptr,             // (n)
            cudaStream_t stream = 0, bool unique_key = true) const {
    if (n == 0) {
      return;
    }

    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }

    constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D);
    if (unique_key && options_.max_bucket_size >= MinBucketCapacityFilter) {
      // Track load factor to choose between TLP and pipelined kernels.
      static thread_local int step_counter = 0;
      static thread_local float load_factor = 0.0;
      if (((step_counter++) % kernel_select_interval_) == 0) {
        load_factor = fast_load_factor(0, stream, false);
      }

      if (load_factor > 0.875f && options_.max_bucket_size == 128) {
        // At high load factors, the TLP kernel degrades because empty-slot
        // early termination fails.  Switch to the pipelined cooperative kernel
        // which scans all 128 digests in one parallel step (32 threads/key).
        constexpr uint32_t BLOCK_SIZE = 128U;
        const size_t grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
        lookup_ptr_kernel_with_pipeline<key_type, value_type, score_type>
            <<<grid_size, BLOCK_SIZE, 0, stream>>>(
                table_->buckets, table_->buckets_num, options_.dim, keys,
                values, scores, founds, n);
      } else {
        constexpr uint32_t BLOCK_SIZE = 128U;
        tlp_lookup_ptr_kernel_with_filter<key_type, value_type, score_type,
                                          evict_strategy>
            <<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE, 0, stream>>>(
                table_->buckets, table_->buckets_num, options_.max_bucket_size,
                options_.dim, keys, values, scores, founds, n, false,
                global_epoch_);
      }
    } else {
      using Selector = SelectLookupPtrKernel<key_type, value_type, score_type>;
      static thread_local int step_counter = 0;
      static thread_local float load_factor = 0.0;

      if (((step_counter++) % kernel_select_interval_) == 0) {
        load_factor = fast_load_factor(0, stream, false);
      }

      Selector::execute_kernel(load_factor, options_.block_size,
                               options_.max_bucket_size, table_->buckets_num,
                               options_.dim, stream, n, d_table_,
                               table_->buckets, keys, values, scores, founds);
    }

    CudaCheckError();
  }

  /**
   * @brief Searches the hash table for the specified keys and returns address
   * of the values, and will update the scores.
   *
   * @note When a key is missing, the data in @p values won't change.
   * @warning This API returns internal addresses for high-performance but
   * thread-unsafe. The caller is responsible for guaranteeing data consistency.
   *
   * @param n The number of key-value-score tuples to search.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param values The addresses of values to search on GPU-accessible memory
   * with shape (n).
   * @param founds The status that indicates if the keys are found on
   * GPU-accessible memory with shape (n).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   * @param stream The CUDA stream that is used to execute the operation.
   * @param unique_key If all keys in the same batch are unique.
   *
   */
  void find_and_update(const size_type n, const key_type* keys,  // (n)
                       value_type** values,                      // (n)
                       bool* founds,                             // (n)
                       score_type* scores = nullptr,             // (n)
                       cudaStream_t stream = 0, bool unique_key = true) {
    // Nothing to do for an empty batch.
    if (n == 0) return;

    // Optional API-level shared lock, held until this call returns.
    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }

    check_evict_strategy(scores);

    // Only the filter-based TLP kernel can update scores; it needs unique
    // keys and buckets of at least sizeof(VecD_Load) / sizeof(D) slots.
    constexpr uint32_t MinBucketCapacityFilter = sizeof(VecD_Load) / sizeof(D);
    const bool supported =
        unique_key && options_.max_bucket_size >= MinBucketCapacityFilter;
    if (!supported) {
      throw std::runtime_error(
          "Not support update score when keys are not unique or bucket "
          "capacity is small.");
    }

    constexpr uint32_t BLOCK_SIZE = 128U;
    const size_t grid_size = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    // The `true` argument asks the kernel to update scores as well.
    tlp_lookup_ptr_kernel_with_filter<key_type, value_type, score_type,
                                      evict_strategy>
        <<<grid_size, BLOCK_SIZE, 0, stream>>>(
            table_->buckets, table_->buckets_num, options_.max_bucket_size,
            options_.dim, keys, values, scores, founds, n, true, global_epoch_);

    CudaCheckError();
  }

  /**
   * @brief Checks if there are elements with key equivalent to `keys` in the
   * table.
   *
   * @param n The number of `keys` to check.
   * @param keys The keys to search on GPU-accessible memory with shape (n).
   * @param founds The result that indicates if the keys are found, and should
   * be allocated by caller on GPU-accessible memory with shape (n).
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  void contains(const size_type n, const key_type* keys,  // (n)
                bool* founds,                             // (n)
                cudaStream_t stream = 0) const {
    MERLIN_CHECK(
        !is_memory_mode(),
        "[MEMORY_MODE] contains() is not supported in dual-bucket mode. "
        "Key may reside in either bucket.");
    if (n == 0) {
      return;
    }

    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }

    if (options_.max_bucket_size == 128) {
      // Pipeline lookup kernel only supports "bucket_size = 128".
      using Selector = SelectPipelineContainsKernel<key_type, value_type,
                                                    score_type, ArchTag>;
      ContainsKernelParams<key_type, value_type, score_type> containsParams(
          table_->buckets, table_->buckets_num, static_cast<uint32_t>(dim()),
          keys, founds, n);
      Selector::select_kernel(containsParams, stream);
    } else {
      using Selector = SelectContainsKernel<key_type, value_type, score_type>;
      static thread_local int step_counter = 0;
      static thread_local float load_factor = 0.0;

      if (((step_counter++) % kernel_select_interval_) == 0) {
        load_factor = fast_load_factor(0, stream, false);
      }
      Selector::execute_kernel(load_factor, options_.block_size,
                               options_.max_bucket_size, table_->buckets_num,
                               options_.dim, stream, n, d_table_,
                               table_->buckets, keys, founds);
    }
    CudaCheckError();
  }

  /**
   * @brief Removes specified elements from the hash table.
   *
   * @param n The number of keys to remove.
   * @param keys The keys to remove on GPU-accessible memory.
   * @param stream The CUDA stream that is used to execute the operation.
   *
   */
  void erase(const size_type n, const key_type* keys, cudaStream_t stream = 0) {
    // Rejected up front in memory (dual-bucket) mode, even for n == 0.
    MERLIN_CHECK(!is_memory_mode(),
                 "[MEMORY_MODE] erase() is not supported in dual-bucket mode. "
                 "Key may reside in either bucket.");
    if (n == 0) return;

    // Optional API-level lock, held until this call returns.
    std::unique_ptr<update_read_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);
    }

    // The kernel is launched over n * TILE_SIZE work items.
    const size_t threads_per_block = options_.block_size;
    const size_t work_items = n * TILE_SIZE;
    const size_t blocks = SAFE_GET_GRID_SIZE(work_items, threads_per_block);

    remove_kernel<key_type, value_type, score_type, TILE_SIZE>
        <<<blocks, threads_per_block, 0, stream>>>(
            d_table_, keys, table_->buckets, table_->buckets_size,
            table_->bucket_max_size, table_->buckets_num, work_items);

    CudaCheckError();
  }

  /**
   * @brief Erases all elements that satisfy the predicate @p pred from the
   * hash table.
   *
   * @tparam PredFunctor The predicate template <typename K, typename S>
   * function with operator signature (bool*)(const K&, const S&, const K&,
   * const threshold) that returns `true` if the element should be erased. The
   * value for @p pred should be a function with type `Pred` defined like the
   * following example:
   *
   *    ```
   *    template <class K, class S>
   *    struct EraseIfPredFunctor {
   *      __forceinline__ __device__ bool operator()(const K& key,
   *                                                 S& score,
   *                                                 const K& pattern,
   *                                                 const S& threshold) {
   *        return (((key & 0x1) == pattern) && (score < threshold));
   *      }
   *    };
   *    ```
   *
   * @param pattern The third user-defined argument to @p pred with key_type
   * type.
   * @param threshold The fourth user-defined argument to @p pred with
   * score_type type.
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return The number of elements removed.
   *
   */
  template <template <typename, typename> class PredFunctor>
  size_type erase_if(const key_type& pattern, const score_type& threshold,
                     cudaStream_t stream = 0) {
    // Optional API-level lock, held until this call returns.
    std::unique_ptr<update_read_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);
    }

    // Device-side counter of removed entries, zeroed before the kernel runs.
    auto counter_ws{
        dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)};
    auto removed_dev{counter_ws.get<size_type*>(0)};
    CUDA_CHECK(cudaMemsetAsync(removed_dev, 0, sizeof(size_type), stream));

    // The kernel is launched over one work item per bucket.
    const size_t threads_per_block = options_.block_size;
    const size_t work_items = table_->buckets_num;
    const size_t blocks = SAFE_GET_GRID_SIZE(work_items, threads_per_block);

    remove_kernel<key_type, value_type, score_type, PredFunctor>
        <<<blocks, threads_per_block, 0, stream>>>(
            d_table_, pattern, threshold, removed_dev, table_->buckets,
            table_->buckets_size, table_->bucket_max_size, table_->buckets_num,
            work_items);

    // Copy the count back and wait for the stream so the host value is valid.
    size_type removed = 0;
    CUDA_CHECK(cudaMemcpyAsync(&removed, removed_dev, sizeof(size_type),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CudaCheckError();
    return removed;
  }

  /**
   * @brief Erases the key-value-score tuples that match the predicate
   * @p pred.
   *
   * @tparam PredFunctor A functor with template <K, V, S> that defines an
   * operator with signature: `__device__ bool operator()(const K&, const V*,
   * const S&, const cg::thread_block_tile<GroupSize>&)`.
   *
   * @param pred The predicate functor; an element is erased when it returns
   * `true`.
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return The number of elements removed.
   */

  template <typename PredFunctor>
  size_type erase_if_v2(PredFunctor& pred, cudaStream_t stream = 0) {
    // Optional API-level lock, held until this call returns.
    std::unique_ptr<update_read_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);
    }

    // Device-side counter of removed entries, zeroed before the kernel runs.
    auto dev_ws{dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)};
    auto d_count{dev_ws.get<size_type*>(0)};

    CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream));

    {
      /// Search_length should be multiple of GroupSize for communication.
      uint64_t dim = table_->dim;
      // Scan only the slots that currently exist. The table's current
      // capacity can be smaller than options_.max_capacity (the table grows
      // via reserve()), and using max_capacity here would make the kernel
      // walk past the allocated buckets.
      uint64_t n = table_->capacity;
      // Pick the widest cooperative group size (32/16/8) that both the value
      // dimension and the scan length allow; otherwise fall back to 1.
      auto kernel = [&] {
        if (dim >= 32 && n % 32 == 0) {
          return remove_kernel_v2<key_type, value_type, score_type, PredFunctor,
                                  32>;
        } else if (dim >= 16 && n % 16 == 0) {
          return remove_kernel_v2<key_type, value_type, score_type, PredFunctor,
                                  16>;
        } else if (dim >= 8 && n % 8 == 0) {
          return remove_kernel_v2<key_type, value_type, score_type, PredFunctor,
                                  8>;
        }
        return remove_kernel_v2<key_type, value_type, score_type, PredFunctor,
                                1>;
      }();
      // Cap the grid at sm_cnt_ * max_threads_per_block_ / block_size blocks.
      uint64_t block_size = 128UL;
      uint64_t grid_size =
          std::min(sm_cnt_ * max_threads_per_block_ / block_size,
                   SAFE_GET_GRID_SIZE(n, block_size));
      // The scan starts at slot offset 0.
      kernel<<<grid_size, block_size, 0, stream>>>(
          n, 0, pred, table_->buckets, table_->buckets_size,
          table_->bucket_max_size, table_->dim, d_count);
    }

    // Copy the count back and wait for the stream so the host value is valid.
    size_type count = 0;
    CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CudaCheckError();
    return count;
  }

  /**
   * @brief Removes all of the elements in the hash table with no release
   * object.
   */
  void clear(cudaStream_t stream = 0) {
    // Optional API-level lock, held until this call returns.
    std::unique_ptr<update_read_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);
    }

    // The kernel is launched over one work item per slot
    // (buckets_num * bucket_max_size).
    const size_t threads_per_block = options_.block_size;
    const size_t slot_count = table_->buckets_num * table_->bucket_max_size;
    const size_t blocks = SAFE_GET_GRID_SIZE(slot_count, threads_per_block);

    clear_kernel<key_type, value_type, score_type>
        <<<blocks, threads_per_block, 0, stream>>>(d_table_, table_->buckets,
                                                   slot_count);

    CudaCheckError();
  }

 public:
  /**
   * @brief Exports a certain number of the key-value-score tuples from the
   * hash table.
   *
   * @param n The maximum number of exported pairs.
   * @param offset The position of the key to search.
   * @param d_counter Accumulates amount of successfully exported values.
   * @param keys The keys to dump from GPU-accessible memory with shape (n).
   * @param values The values to dump from GPU-accessible memory with shape
   * (n, DIM).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return The number of elements dumped.
   *
   * @throw CudaException If the key-value size is too large for GPU shared
   * memory. Reducing the value for @p n is currently required if this exception
   * occurs.
   */
  void export_batch(size_type n, const size_type offset,
                    size_type* d_counter,          // (1)
                    key_type* keys,                // (n)
                    value_type* values,            // (n, DIM)
                    score_type* scores = nullptr,  // (n)
                    cudaStream_t stream = 0) const {
    // Optional API-level shared lock, held until this call returns.
    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }

    // Reset the device counter first so every early-out leaves it at 0.
    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));
    if (offset >= table_->capacity) {
      return;
    }
    // Never scan past the end of the table.
    n = std::min(table_->capacity - offset, n);
    // Nothing to export: skip the launch instead of computing a grid size for
    // zero work items (same guard as export_batch_if / export_batch_if_v2).
    if (n == 0) {
      return;
    }

    // Shared-memory bytes and block size are derived from the shared memory
    // budget in shared_mem_size_.
    size_type shared_size;
    size_type block_size;
    std::tie(shared_size, block_size) =
        dump_kernel_shared_memory_size<K, V, S>(shared_mem_size_);

    const size_t grid_size = SAFE_GET_GRID_SIZE(n, block_size);

    dump_kernel<key_type, value_type, score_type>
        <<<grid_size, block_size, shared_size, stream>>>(
            d_table_, table_->buckets, keys, values, scores, offset, n,
            d_counter);

    CudaCheckError();
  }

  size_type export_batch(const size_type n, const size_type offset,
                         key_type* keys,                // (n)
                         value_type* values,            // (n, DIM)
                         score_type* scores = nullptr,  // (n)
                         cudaStream_t stream = 0) const {
    // Convenience overload: exports through a temporary device counter and
    // returns the exported count on the host.
    auto counter_ws{
        dev_mem_pool_->get_workspace<1>(sizeof(size_type), stream)};
    auto exported_dev{counter_ws.get<size_type*>(0)};
    CUDA_CHECK(cudaMemsetAsync(exported_dev, 0, sizeof(size_type), stream));

    export_batch(n, offset, exported_dev, keys, values, scores, stream);

    // Copy the count back and wait for the stream so the host value is valid.
    size_type exported = 0;
    CUDA_CHECK(cudaMemcpyAsync(&exported, exported_dev, sizeof(size_type),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    return exported;
  }

  /**
   * @brief Exports a certain number of the key-value-score tuples which match
   * the specified condition from the hash table.
   *
   * @tparam PredFunctor A functor with template <K, S> that defines an
   * operator with signature: `__device__ bool operator()(const K&, S&,
   * const K&, const S&)`.
   *
   * @param n The maximum number of exported pairs.
   *
   * @note `PredFunctor` should be a class template defined like the following
   * example:
   *
   *    ```
   *    template <class K, class S>
   *    struct ExportIfPredFunctor {
   *      __forceinline__ __device__ bool operator()(const K& key,
   *                                                 S& score,
   *                                                 const K& pattern,
   *                                                 const S& threshold) {
   *        return score >= threshold;
   *      }
   *    };
   *    ```
   *
   * @param pattern The third user-defined argument to @p pred with key_type
   * type.
   * @param threshold The fourth user-defined argument to @p pred with
   * score_type type.
   * @param offset The position of the key to search.
   * @param keys The keys to dump from GPU-accessible memory with shape (n).
   * @param values The values to dump from GPU-accessible memory with shape
   * (n, DIM).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return The number of elements dumped.
   *
   * @throw CudaException If the key-value size is too large for GPU shared
   * memory. Reducing the value for @p n is currently required if this exception
   * occurs.
   */
  template <template <typename, typename> class PredFunctor>
  void export_batch_if(const key_type& pattern, const score_type& threshold,
                       size_type n, const size_type offset,
                       size_type* d_counter,
                       key_type* keys,                // (n)
                       value_type* values,            // (n, DIM)
                       score_type* scores = nullptr,  // (n)
                       cudaStream_t stream = 0) const {
    // Optional API-level shared lock, held until this call returns.
    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }
    // Reset the device counter first so every early-out leaves it at 0.
    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));

    if (offset >= table_->capacity) {
      return;
    }
    // Never scan past the end of the table.
    n = std::min(table_->capacity - offset, n);
    if (n == 0) {
      return;
    }

    // Cleared by select_tile_size when no dump_kernel_v2 variant qualifies.
    bool match_fast_cond = true;
    const size_t value_size = sizeof(V) * dim();
    // A tile size qualifies only if it divides the bucket size, the start
    // offset and the scan length, and does not exceed the bucket size.
    auto check_tile_size = [&](int tile_size) {
      return options_.max_bucket_size % tile_size == 0 &&
             options_.max_bucket_size >= tile_size && offset % tile_size == 0 &&
             n % tile_size == 0;
    };
    // For a given vector type, pick the largest tile (32, 16, 8) such that a
    // value holds at least that many vector elements and the tile qualifies;
    // otherwise fall back to dump_kernel and record that via match_fast_cond.
    auto select_tile_size = [&](auto vec) {
      using VecV = decltype(vec);
      size_t vec_dim = value_size / sizeof(VecV);
      if (vec_dim >= 32 && check_tile_size(32)) {
        return dump_kernel_v2<key_type, value_type, score_type, VecV,
                              PredFunctor, 32>;
      } else if (vec_dim >= 16 && check_tile_size(16)) {
        return dump_kernel_v2<key_type, value_type, score_type, VecV,
                              PredFunctor, 16>;
      } else if (vec_dim >= 8 && check_tile_size(8)) {
        return dump_kernel_v2<key_type, value_type, score_type, VecV,
                              PredFunctor, 8>;
      }
      match_fast_cond = false;
      return dump_kernel<key_type, value_type, score_type, PredFunctor>;
    };
    // Try vector types from widest to narrowest (float4, float2, float,
    // uint16_t); a type is used when the value size is a multiple of it and
    // covers at least 8 elements. V itself is the last resort.
    auto kernel = [&] {
      if (value_size >= sizeof(float4) * 8 &&
          value_size % sizeof(float4) == 0) {
        return select_tile_size(float4{});
      } else if (value_size >= sizeof(float2) * 8 &&
                 value_size % sizeof(float2) == 0) {
        return select_tile_size(float2{});
      } else if (value_size >= sizeof(float) * 8 &&
                 value_size % sizeof(float) == 0) {
        return select_tile_size(float{});
      } else if (value_size >= sizeof(uint16_t) * 8 &&
                 value_size % sizeof(uint16_t) == 0) {
        return select_tile_size(uint16_t{});
      }
      return select_tile_size(V{});
    }();
    size_t grid_size = 0, block_size = 0, shared_size = 0;
    if (match_fast_cond) {
      // dump_kernel_v2 path: no dynamic shared memory; the grid is capped at
      // sm_cnt_ * max_threads_per_block_ / block_size blocks.
      block_size = options_.block_size;
      grid_size = std::min(sm_cnt_ * max_threads_per_block_ / block_size,
                           SAFE_GET_GRID_SIZE(n, block_size));
    } else {
      // dump_kernel path: dynamic shared memory is sized as one
      // key + value (+ score, when requested) record per thread.
      const size_t score_size = scores ? sizeof(score_type) : 0;
      const size_t kvm_size =
          sizeof(key_type) + sizeof(value_type) * dim() + score_size;
      // Block size is bounded by half of shared_mem_size_ and by 1024.
      block_size = std::min(shared_mem_size_ / 2 / kvm_size, 1024UL);
      MERLIN_CHECK(
          block_size > 0,
          "[HierarchicalKV] block_size <= 0, the K-V-S size may be too large!");

      shared_size = kvm_size * block_size;
      grid_size = SAFE_GET_GRID_SIZE(n, block_size);
    }
    kernel<<<grid_size, block_size, shared_size, stream>>>(
        d_table_, table_->buckets, pattern, threshold, keys, values, scores,
        offset, n, d_counter);

    CudaCheckError();
  }

  /**
   * @brief Exports a certain number of key-value-score tuples that match a
   * given predicate.
   *
   * @tparam PredFunctor A functor type with a template signature `<K, V, S>`.
   * It should define an operator with the signature:
   * `__device__ bool operator()(const K&, const V*, const S&,
   * cg::thread_block_tile<GroupSize>&)`.
   *
   * @param pred A functor of type `PredFunctor` that defines the predicate for
   * filtering tuples.
   * @param n The maximum number of exported pairs.
   * @param offset The position of the key to search.
   * @param d_counter The number of elements dumped which is on device.
   * @param keys The keys to dump from GPU-accessible memory with shape (n).
   * @param values The values to dump from GPU-accessible memory with shape (n,
   * DIM).
   * @param scores The scores to search on GPU-accessible memory with shape (n).
   * @parblock
   * If @p scores is `nullptr`, the score for each key will not be returned.
   * @endparblock
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return void
   *
   */

  template <typename PredFunctor>
  void export_batch_if_v2(PredFunctor& pred, size_type n,
                          const size_type offset, size_type* d_counter,
                          key_type* keys,                // (n)
                          value_type* values,            // (n, DIM)
                          score_type* scores = nullptr,  // (n)
                          cudaStream_t stream = 0) const {
    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }
    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));

    if (offset >= table_->capacity) {
      return;
    }
    n = std::min(table_->capacity - offset, n);
    if (n == 0) {
      return;
    }

    /// Search_length should be multiple of GroupSize for communication.
    uint64_t dim = table_->dim;
    auto kernel = [&] {
      if (dim >= 32 && n % 32 == 0) {
        return dump_kernel<key_type, value_type, score_type, PredFunctor, 32>;
      } else if (dim >= 16 && n % 16 == 0) {
        return dump_kernel<key_type, value_type, score_type, PredFunctor, 16>;
      } else if (dim >= 8 && n % 8 == 0) {
        return dump_kernel<key_type, value_type, score_type, PredFunctor, 8>;
      }
      return dump_kernel<key_type, value_type, score_type, PredFunctor, 1>;
    }();
    uint64_t block_size = 128UL;
    uint64_t grid_size = std::min(sm_cnt_ * max_threads_per_block_ / block_size,
                                  SAFE_GET_GRID_SIZE(n, block_size));
    kernel<<<grid_size, block_size, 0, stream>>>(
        n, offset, pred, table_->buckets, table_->bucket_max_size, dim, keys,
        values, scores, d_counter);

    CudaCheckError();
  }

  /**
   * @brief Applies the given function to items in the range [first, last) in
   * the table.
   *
   * @tparam ExecutionFunc A functor type with a template signature `<K, V, S>`.
   * It should define an operator with the signature:
   * `__device__ void operator()(const K&, V*, S*,
   * cg::thread_block_tile<GroupSize>&)`.
   *
   * @param first The first element to which the function object will be
   * applied.
   * @param last The last element(excluding) to which the function object will
   * be applied.
   * @param f A functor of type `ExecutionFunc` that is applied to each item
   * in the range. signature: `__device__ void operator()(const K&, V*, S*,
   * cg::thread_block_tile<GroupSize>&)`.
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return void
   *
   */

  template <typename ExecutionFunc>
  void for_each(const size_type first, const size_type last, ExecutionFunc& f,
                cudaStream_t stream = 0) {
    std::unique_ptr<update_read_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);
    }

    if (first >= table_->capacity or last > table_->capacity or first >= last) {
      return;
    }
    uint64_t n = last - first;

    /// Search_length should be multiple of GroupSize for communication.
    uint64_t dim = table_->dim;
    auto kernel = [&] {
      if (dim >= 32 && n % 32 == 0) {
        return traverse_kernel<key_type, value_type, score_type, ExecutionFunc,
                               32>;
      } else if (dim >= 16 && n % 16 == 0) {
        return traverse_kernel<key_type, value_type, score_type, ExecutionFunc,
                               16>;
      } else if (dim >= 8 && n % 8 == 0) {
        return traverse_kernel<key_type, value_type, score_type, ExecutionFunc,
                               8>;
      }
      return traverse_kernel<key_type, value_type, score_type, ExecutionFunc,
                             1>;
    }();
    uint64_t block_size = 128UL;
    uint64_t grid_size = std::min(sm_cnt_ * max_threads_per_block_ / block_size,
                                  SAFE_GET_GRID_SIZE(n, block_size));
    kernel<<<grid_size, block_size, 0, stream>>>(n, first, f, table_->buckets,
                                                 table_->bucket_max_size, dim);

    CudaCheckError();
  }

 public:
  /**
   * @brief Indicates if the hash table has no elements.
   *
   * @param stream The CUDA stream that is used to execute the operation.
   * @return `true` if the table is empty and `false` otherwise.
   */
  bool empty(cudaStream_t stream = 0) const {
    // Delegates to size(), so this performs the same device-side count.
    const size_type element_count = size(stream);
    return element_count == 0;
  }

  /**
   * @brief Returns the hash table size.
   *
   * @param stream The CUDA stream that is used to execute the operation.
   * @return The table size.
   */
  size_type size(cudaStream_t stream = 0) const {
    // Optional API-level shared lock, held until this call returns.
    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }

    const size_type bucket_count = table_->buckets_num;

    // Sum the per-bucket sizes on the device.
    auto reducer = SumOp<int, int64_t>();
    auto scratch_bytes = reducer.get_storage_bytes(bucket_count, stream);

    // One workspace carries both the int64_t result and the reduction's
    // temporary storage.
    MultiVector<int64_t, uint8_t> layout(1, scratch_bytes);
    const size_type workspace_bytes = layout.total_size();
    auto workspace{dev_mem_pool_->get_workspace<1>(workspace_bytes, stream)};
    auto workspace_base = workspace.get<uint8_t*>(0);
    auto total_dev = get_vector<0>(layout, workspace_base);
    auto scratch_dev = get_vector<1>(layout, workspace_base);

    reducer.set_storage(reinterpret_cast<void*>(scratch_dev));
    reducer.sum(bucket_count, table_->buckets_size, total_dev, stream);

    // Copy the total back and wait for the stream so the host value is valid.
    int64_t total_host = 0;
    CUDA_CHECK(cudaMemcpyAsync(&total_host, total_dev, sizeof(int64_t),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CudaCheckError();
    return static_cast<size_type>(total_host);
  }

  /**
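   * @brief Counts the keys that satisfy PredFunctor and writes the result to
   * @p d_counter.
   *
   * @param pattern The user-defined key_type argument passed to the predicate.
   * @param threshold The user-defined score_type argument passed to the
   * predicate.
   * @param d_counter Device pointer that receives the number of keys matching
   * the condition of PredFunctor; it is reset to 0 before counting.
   * @param stream The CUDA stream that is used to execute the operation.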
   */
  template <template <typename, typename> class PredFunctor>
  void size_if(const key_type& pattern, const score_type& threshold,
               size_type* d_counter, cudaStream_t stream = 0) const {
    // Optional API-level shared lock, held until this call returns.
    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<read_shared_lock>(mutex_, stream);
    }

    // The device counter starts from zero on every call.
    CUDA_CHECK(cudaMemsetAsync(d_counter, 0, sizeof(size_type), stream));

    // Enough blocks to cover the capacity, capped at
    // sm_cnt_ * max_threads_per_block_ / block_size blocks.
    const size_t needed_blocks =
        SAFE_GET_GRID_SIZE(capacity(), options_.block_size);
    const size_t block_cap = static_cast<size_t>(
        sm_cnt_ * max_threads_per_block_ / options_.block_size);
    const size_t blocks = std::min(needed_blocks, block_cap);

    size_if_kernel<key_type, value_type, score_type, PredFunctor>
        <<<blocks, options_.block_size, 0, stream>>>(
            d_table_, table_->buckets, pattern, threshold, d_counter);
    CudaCheckError();
  }

  /**
   * @brief Returns the hash table capacity.
   *
   * @note The value that is returned might be less than the actual capacity of
   * the hash table because the hash table currently keeps the capacity to be
   * a power of 2 for performance considerations.
   *
   * @return The table capacity.
   */
  size_type capacity() const {
    // Plain host-side read of the table descriptor; takes no lock and
    // launches nothing.
    return table_->capacity;
  }

  /**
   * @brief Sets the number of buckets to the number that is needed to
   * accommodate at least @p new_capacity elements without exceeding the maximum
   * load factor. This method rehashes the hash table. Rehashing puts the
   * elements into the appropriate buckets considering that total number of
   * buckets has changed.
   *
   * @note If the value of @p new_capacity or double of @p new_capacity is
   * greater or equal than `options_.max_capacity`, the reserve does not perform
   * any change to the hash table.
   *
   * @param new_capacity The requested capacity for the hash table.
   * @param stream The CUDA stream that is used to execute the operation.
   */
  void reserve(const size_type new_capacity, cudaStream_t stream = 0) {
    MERLIN_CHECK(
        !is_memory_mode(),
        "[MEMORY_MODE] reserve() is not supported in dual-bucket mode. "
        "Rehash does not preserve dual-bucket mapping.");

    // No growth is attempted when the table is flagged as being at its limit
    // or the request exceeds the configured maximum; only the flag is
    // refreshed.
    if (reach_max_capacity_ || new_capacity > options_.max_capacity) {
      reach_max_capacity_ = (capacity() * 2 > options_.max_capacity);
      return;
    }

    {
      std::unique_ptr<update_read_lock> lock_ptr;
      if (options_.api_lock) {
        lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);
      }

      // With the lock held, drain all pending GPU work before the table is
      // restructured.
      CUDA_CHECK(cudaDeviceSynchronize());

      // Keep doubling until the request is satisfied or another doubling
      // would exceed the maximum capacity.
      while (true) {
        if (capacity() >= new_capacity) {
          break;
        }
        if (capacity() * 2 > options_.max_capacity) {
          break;
        }

        double_capacity<key_type, value_type, score_type>(&table_, allocator_);
        CUDA_CHECK(cudaDeviceSynchronize());
        // Publish the updated TableCore to its device copy before rehashing.
        sync_table_configuration();

        const size_t threads_per_block = options_.block_size;
        const size_t N = TILE_SIZE * table_->buckets_num / 2;
        const size_t num_blocks = SAFE_GET_GRID_SIZE(N, threads_per_block);

        rehash_kernel_for_fast_mode<key_type, value_type, score_type, TILE_SIZE>
            <<<num_blocks, threads_per_block, 0, stream>>>(
                d_table_, table_->buckets, N);
      }

      CUDA_CHECK(cudaDeviceSynchronize());
      reach_max_capacity_ = (capacity() * 2 > options_.max_capacity);
    }
    CudaCheckError();
  }

  /**
   * @brief Returns the average number of elements per slot, that is, size()
   * divided by capacity().
   *
   * @param stream The CUDA stream that is used to execute the operation.
   *
   * @return The load factor
   */
  float load_factor(cudaStream_t stream = 0) const {
    // Both operands are promoted to double before the division; only the
    // final quotient is narrowed to float.
    const double used_slots = size(stream) * 1.0;
    const double total_slots = capacity() * 1.0;
    return static_cast<float>(used_slots / total_slots);
  }

  /**
   * @brief Set max_capacity of the table.
   *
   * @param new_max_capacity The new expected max_capacity. It must be a power
   * of 2; otherwise `std::invalid_argument` is thrown. A value smaller than
   * the current capacity is ignored.
   */
  void set_max_capacity(size_type new_max_capacity) {
    // Validate before taking the lock; only powers of two are accepted.
    const bool power_of_two = is_power(2, new_max_capacity);
    if (!power_of_two) {
      throw std::invalid_argument(
          "None power-of-2 new_max_capacity is not supported.");
    }

    std::unique_ptr<update_read_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_read_lock>(mutex_);
    }

    // A limit below the current capacity is silently ignored.
    if (new_max_capacity >= capacity()) {
      // Clear the "at max" flag so a later reserve() re-evaluates it.
      if (reach_max_capacity_) {
        reach_max_capacity_ = false;
      }
      options_.max_capacity = new_max_capacity;
    }
  }

  /**
   * @brief Returns the dimension of the vectors.
   *
   * @return The dimension of the vectors.
   */
  size_type dim() const noexcept {
    // Taken from the options the table holds.
    return options_.dim;
  }

  /**
   * @brief Returns the length of each bucket.
   *
   * @return The length of each bucket.
   */
  size_type max_bucket_size() const noexcept {
    // Taken from the options the table holds.
    const auto& opts = options_;
    return opts.max_bucket_size;
  }

  /**
   * @brief Returns the number of buckets in the table.
   *
   * @return The number of buckets in the table.
   */
  size_type bucket_count() const noexcept {
    // Read straight from the host-side table descriptor.
    return table_->buckets_num;
  }

  /**
   * @brief Save keys, vectors, scores in table to file or files.
   *
   * @param file A BaseKVFile object defined the file format on host filesystem.
   * @param max_workspace_size Saving is conducted in chunks. This value denotes
   * the maximum amount of temporary memory to use when dumping the table.
   * Larger values *can* lead to higher performance.
   * @param stream The CUDA stream used to execute the operation.
   *
   * @return Number of KV pairs saved to file.
   */
  size_type save(BaseKVFile<K, V, S>* file,
                 const size_t max_workspace_size = 1L * 1024 * 1024,
                 cudaStream_t stream = 0) const {
    // Bytes needed for one key, one score and one `dim()`-wide value row.
    const size_type tuple_size{sizeof(key_type) + sizeof(score_type) +
                               sizeof(value_type) * dim()};
    MERLIN_CHECK(max_workspace_size >= tuple_size,
                 "[HierarchicalKV] max_workspace_size is smaller than a single "
                 "`key + score + value` tuple! Please set a larger value!");

    size_type shared_size;
    size_type block_size;
    std::tie(shared_size, block_size) =
        dump_kernel_shared_memory_size<K, V, S>(shared_mem_size_);

    // Request exclusive access (to make sure capacity won't change anymore).
    std::unique_ptr<update_read_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr = std::make_unique<update_read_lock>(mutex_, stream);
    }

    // `n` is the number of table slots scanned per batch: as many tuples as
    // fit in the workspace, but never more than the table holds.
    const size_type total_size{capacity()};
    const size_type n{std::min(max_workspace_size / tuple_size, total_size)};
    const size_type grid_size{SAFE_GET_GRID_SIZE(n, block_size)};

    // Grab temporary device and host memory. Both buffers use the same
    // layout: n keys, then n scores, then n value rows.
    const size_type host_ws_size{n * tuple_size};
    auto host_ws{host_mem_pool_->get_workspace<1>(host_ws_size, stream)};
    auto h_keys{host_ws.get<key_type*>(0)};
    auto h_scores{reinterpret_cast<score_type*>(h_keys + n)};
    auto h_values{reinterpret_cast<value_type*>(h_scores + n)};

    // The device buffer additionally starts with the batch counter.
    const size_type dev_ws_size{sizeof(size_type) + host_ws_size};
    auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
    auto d_count{dev_ws.get<size_type*>(0)};
    auto d_keys{reinterpret_cast<key_type*>(d_count + 1)};
    auto d_scores{reinterpret_cast<score_type*>(d_keys + n)};
    auto d_values{reinterpret_cast<value_type*>(d_scores + n)};

    // Step through table, dumping contents in batches.
    size_type total_count{0};
    for (size_type i{0}; i < total_size; i += n) {
      // Dump the next batch to workspace, and then write it to the file.
      CUDA_CHECK(cudaMemsetAsync(d_count, 0, sizeof(size_type), stream));

      dump_kernel<key_type, value_type, score_type>
          <<<grid_size, block_size, shared_size, stream>>>(
              d_table_, table_->buckets, d_keys, d_values, d_scores, i,
              std::min(total_size - i, n), d_count);
      // A failed launch would otherwise go unnoticed and the batch would be
      // reported as empty.
      CudaCheckError();

      // The batch size must be known on the host before the payload copies
      // can be sized.
      size_type count;
      CUDA_CHECK(cudaMemcpyAsync(&count, d_count, sizeof(size_type),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaStreamSynchronize(stream));

      if (count == n) {
        // Full batch: host and device layouts coincide, so one copy moves
        // keys, scores and values together.
        CUDA_CHECK(cudaMemcpyAsync(h_keys, d_keys, host_ws_size,
                                   cudaMemcpyDeviceToHost, stream));
      } else {
        // Partial batch: copy only the populated prefix of each section.
        CUDA_CHECK(cudaMemcpyAsync(h_keys, d_keys, sizeof(key_type) * count,
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaMemcpyAsync(h_scores, d_scores,
                                   sizeof(score_type) * count,
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaMemcpyAsync(h_values, d_values,
                                   sizeof(value_type) * dim() * count,
                                   cudaMemcpyDeviceToHost, stream));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));
      // Accumulate what the file reports as written, so a failed or short
      // write is reflected in the returned total instead of being ignored.
      total_count += file->write(count, dim(), h_keys, h_values, h_scores);
    }

    return total_count;
  }

  /**
   * @brief Load keys, vectors, scores from file to table.
   *
   * @param file A BaseKVFile object that defines the file format on the host
   * filesystem.
   * @param max_workspace_size Loading is conducted in chunks. This value
   * denotes the maximum size of such chunks. Larger values *can* lead to higher
   * performance.
   * @param stream The CUDA stream used to execute the operation.
   *
   * @return Number of keys loaded from file.
   */
  size_type load(BaseKVFile<K, V, S>* file,
                 const size_t max_workspace_size = 1L * 1024 * 1024,
                 cudaStream_t stream = 0) {
    // Bytes needed for one key, one score and one `dim()`-wide value row.
    const size_type tuple_size{sizeof(key_type) + sizeof(score_type) +
                               sizeof(value_type) * dim()};
    MERLIN_CHECK(max_workspace_size >= tuple_size,
                 "[HierarchicalKV] max_workspace_size is smaller than a single "
                 "`key + score + value` tuple! Please set a larger value!");

    // Number of tuples per batch and the matching workspace size in bytes.
    const size_type n{max_workspace_size / tuple_size};
    const size_type ws_size{n * tuple_size};

    // Host staging buffer laid out as n keys, n scores, n value rows.
    auto host_ws{host_mem_pool_->get_workspace<1>(ws_size, stream)};
    auto h_keys{host_ws.get<key_type*>(0)};
    auto h_scores{reinterpret_cast<score_type*>(h_keys + n)};
    auto h_values{reinterpret_cast<value_type*>(h_scores + n)};

    // Read the first batch up front so an empty file never touches the
    // device memory pool.
    size_type batch{file->read(n, dim(), h_keys, h_values, h_scores)};
    if (batch == 0) {
      return 0;
    }

    // Device staging buffer with the same layout as the host one.
    auto dev_ws{dev_mem_pool_->get_workspace<1>(ws_size, stream)};
    auto d_keys{dev_ws.get<key_type*>(0)};
    auto d_scores{reinterpret_cast<score_type*>(d_keys + n)};
    auto d_values{reinterpret_cast<value_type*>(d_scores + n)};

    size_type loaded{0};
    while (batch > 0) {
      const bool full_batch = (batch == n);
      if (full_batch) {
        // Layouts coincide, so a single copy moves all three sections.
        CUDA_CHECK(cudaMemcpyAsync(d_keys, h_keys, ws_size,
                                   cudaMemcpyHostToDevice, stream));
      } else {
        // Short batch: copy only the populated prefix of each section.
        CUDA_CHECK(cudaMemcpyAsync(d_keys, h_keys, sizeof(key_type) * batch,
                                   cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_scores, h_scores,
                                   sizeof(score_type) * batch,
                                   cudaMemcpyHostToDevice, stream));
        CUDA_CHECK(cudaMemcpyAsync(d_values, h_values,
                                   sizeof(value_type) * dim() * batch,
                                   cudaMemcpyHostToDevice, stream));
      }

      set_global_epoch(static_cast<S>(IGNORED_GLOBAL_EPOCH));
      insert_or_assign(batch, d_keys, d_values, d_scores, stream, true, true);
      loaded += batch;

      // The host buffer is reused for the next read, so wait for the stream
      // to finish with it first.
      CUDA_CHECK(cudaStreamSynchronize(stream));
      batch = file->read(n, dim(), h_keys, h_values, h_scores);
    }

    return loaded;
  }

  // Stores `epoch` as the table's global epoch.
  void set_global_epoch(const uint64_t epoch) {
    global_epoch_.store(epoch);
  }

 private:
  /**
   * @brief Tests whether @p n is an exact power of @p base (base^k, k >= 0).
   *
   * @param base The base to test against. Must be at least 2.
   * @param n The value to test. 0 is never a power; 1 is base^0.
   *
   * @return `true` if @p n equals base^k for some k >= 0, `false` otherwise.
   *
   * @throws std::invalid_argument if @p base is smaller than 2.
   */
  bool is_power(size_t base, size_t n) {
    if (base < 2) {
      throw std::invalid_argument("is_power requires a base of at least 2.");
    }
    // 0 is not a power of any base; without this guard the loop below would
    // never terminate (0 % base == 0 forever).
    if (n == 0) {
      return false;
    }
    // Strip every factor of `base`; a true power is reduced to exactly 1.
    while (n % base == 0) {
      n /= base;
    }
    return n == 1;
  }

 private:
  // Reports the `is_pure_hbm` flag of the host-side table descriptor.
  inline bool is_fast_mode() const noexcept {
    return table_->is_pure_hbm;
  }

  // True when the table was configured with TableMode::kMemory.
  inline bool is_memory_mode() const noexcept {
    const auto mode = options_.table_mode;
    return mode == TableMode::kMemory;
  }

  /**
   * @brief Returns the load factor by sampling up to 1024 buckets.
   *
   * @note For performance consideration, the returned load factor is
   * inaccurate but within an error in 1% empirically which is enough for
   * capacity control. But it's not suitable for end-users.
   *
   * @param delta A hypothetical upcoming change on table size.
   * @param stream The CUDA stream used to execute the operation.
   * @param need_lock If lock is needed.
   *
   * @return The evaluated load factor
   */
  inline float fast_load_factor(const size_type delta = 0,
                                cudaStream_t stream = 0,
                                const bool need_lock = true) const {
    // The lock object is created deferred and only acquired when the caller
    // asks for it; nothing is created when API locking is disabled.
    std::unique_ptr<read_shared_lock> lock_ptr;
    if (options_.api_lock) {
      lock_ptr =
          std::make_unique<read_shared_lock>(mutex_, std::defer_lock, stream);
      if (need_lock) {
        lock_ptr->lock();
      }
    }

    // Sample at most the first 1024 entries of `buckets_size`.
    size_t N = std::min(table_->buckets_num, 1024UL);

    auto sumOp = SumOp<int, int64_t>();
    auto d_sum_bytes = sumOp.get_storage_bytes(N, stream);

    // One device workspace holds both the int64_t result and the scratch
    // storage requested by SumOp.
    MultiVector<int64_t, uint8_t> mv(1, d_sum_bytes);
    const size_type dev_ws_size = mv.total_size();
    auto dev_ws{dev_mem_pool_->get_workspace<1>(dev_ws_size, stream)};
    auto temp_storage = dev_ws.get<uint8_t*>(0);
    auto d_total_size = get_vector<0>(mv, temp_storage);
    auto d_sum_storage = get_vector<1>(mv, temp_storage);
    sumOp.set_storage(reinterpret_cast<void*>(d_sum_storage));
    sumOp.sum(N, table_->buckets_size, d_total_size, stream);

    // Bring the sum back to the host; the stream is synchronized before the
    // value is used.
    int64_t h_total_size = 0;
    CUDA_CHECK(cudaMemcpyAsync(&h_total_size, d_total_size, sizeof(int64_t),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CudaCheckError();
    // Result = delta relative to the whole capacity, plus the sampled
    // occupancy relative to the slots of the N sampled buckets.
    return static_cast<float>((delta * 1.0) / (capacity() * 1.0) +
                              (h_total_size * 1.0) /
                                  (options_.max_bucket_size * N * 1.0));
  }

  // Validates that the caller's `scores` argument (and, for the epoch
  // strategies, the global epoch) is consistent with the eviction strategy.
  inline void check_evict_strategy(const score_type* scores) {
    const bool is_lru = (evict_strategy == EvictStrategy::kLru);
    const bool is_lfu = (evict_strategy == EvictStrategy::kLfu);
    const bool is_epoch_lru = (evict_strategy == EvictStrategy::kEpochLru);
    const bool is_epoch_lfu = (evict_strategy == EvictStrategy::kEpochLfu);
    const bool is_customized =
        (evict_strategy == EvictStrategy::kCustomized);

    // LRU flavours: the caller must not pass scores.
    if (is_lru || is_epoch_lru) {
      MERLIN_CHECK(scores == nullptr,
                   "the scores should not be specified when running on "
                   "LRU or Epoch LRU mode.");
    }

    // LFU flavours: the caller has to pass scores.
    if (is_lfu || is_epoch_lfu) {
      MERLIN_CHECK(scores != nullptr,
                   "the scores should be specified when running on "
                   "LFU or Epoch LFU mode.");
    }

    // Customized: the caller has to pass scores.
    if (is_customized) {
      MERLIN_CHECK(scores != nullptr,
                   "the scores should be specified when running on "
                   "customized mode.");
    }

    // Epoch flavours additionally need a global epoch to have been set.
    if (is_epoch_lru || is_epoch_lfu) {
      MERLIN_CHECK(
          global_epoch_ != static_cast<S>(IGNORED_GLOBAL_EPOCH),
          "the global_epoch is invalid and should be assigned by calling "
          "`set_global_epoch` when running on "
          "Epoch LRU or Epoch LFU mode.");
    }
  }

  /**
   * @brief Synchronize the TableCore struct to replicas.
   *
   * @note For performance consideration, synchronize the TableCore struct to
   * its replicas in constant memory and device memory when it's changed.
   */
  inline void sync_table_configuration() {
    // Blocking copy of the whole TableCore struct from `table_` to
    // `d_table_`; the direction is inferred by the runtime.
    constexpr size_t core_bytes = sizeof(TableCore);
    CUDA_CHECK(cudaMemcpy(d_table_, table_, core_bytes, cudaMemcpyDefault));
  }

 private:
  // Table configuration (block size, capacity limit, API locking, ...).
  HashTableOptions options_;
  // Table descriptor dereferenced on the host (capacity, buckets, ...).
  TableCore* table_ = nullptr;
  // Copy of `*table_` refreshed by sync_table_configuration(); this is the
  // pointer handed to the kernels.
  TableCore* d_table_ = nullptr;
  // Passed to dump_kernel_shared_memory_size() when saving.
  size_t shared_mem_size_ = 0;
  // sm_cnt_ * max_threads_per_block_ / block_size caps the grid in size_if().
  int sm_cnt_ = 0;
  int max_threads_per_block_ = 0;
  // Set by reserve() to `capacity() * 2 > options_.max_capacity`.
  std::atomic_bool reach_max_capacity_{false};
  bool initialized_ = false;
  // Taken through read_shared_lock / update_read_lock when
  // options_.api_lock is enabled.
  mutable group_shared_mutex mutex_;
  const unsigned int kernel_select_interval_ = 7;
  // Sources of the temporary device / host workspaces.
  std::unique_ptr<DeviceMemoryPool> dev_mem_pool_;
  std::unique_ptr<HostMemoryPool> host_mem_pool_;
  // Passed to double_capacity() when the table grows.
  allocator_type* allocator_;
  ThrustAllocator<uint8_t> thrust_allocator_;
  bool default_allocator_ = true;
  // Written by set_global_epoch(); check_evict_strategy() rejects the
  // IGNORED_GLOBAL_EPOCH value for the epoch-based strategies.
  std::atomic<uint64_t> global_epoch_{
      static_cast<uint64_t>(IGNORED_GLOBAL_EPOCH)};
};

}  // namespace merlin
}  // namespace nv


================================================
FILE: include/merlin_localfile.hpp
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <stddef.h>
#include <stdio.h>
#include <string>
#include "merlin/types.cuh"

namespace nv {
namespace merlin {

/**
 * The KV file on local file system. It saves/loads keys, vectors, and
 * scores between table and file. Note that absolute values of scores are
 * commonly time-variant, while the time interval between save/load calling
 * is not deterministic, so reloaded scores may be out of date. If
 * other specified rules are required, the BaseKVFile could be inherited
 * to implement customized read/write rules. The LocalKVFile uses compact,
 * consecutive binary format, where keys, values, and scores are stored in
 * separated paths.
 *
 * @tparam K The data type of the key.
 * @tparam V The data type of the vector's elements.
 *         The item data type should be a basic data type of C++/CUDA.
 * @tparam M The data type for `score`.
 *           The currently supported data type is only `uint64_t`.
 *
 */
template <class K, class V, class M>
class LocalKVFile : public BaseKVFile<K, V, M> {
 public:
  LocalKVFile() : keys_fp_(nullptr), values_fp_(nullptr), scores_fp_(nullptr) {}

  // The object owns raw FILE* handles. A copy would make two objects
  // fclose() the same streams, so copying is disabled.
  LocalKVFile(const LocalKVFile&) = delete;
  LocalKVFile& operator=(const LocalKVFile&) = delete;

  ~LocalKVFile() { close(); }

  /**
   * @brief Open the file from local path. A LocalKVFile can only be
   * read or written when it stays opened. Any files that are still open
   * from a previous call are closed first.
   *
   * @param keys_path Path to file to store keys.
   * @param values_path Path to file to store values.
   * @param scores_path Path to file to store scores.
   * @param mode The mode to the file. The mode follows glibc style
   *             and behavior like fopen.
   *
   * @return `true` if all three files were opened. On failure every file
   * opened so far is closed again and `false` is returned.
   */
  bool open(const std::string& keys_path, const std::string& values_path,
            const std::string& scores_path, const char* mode) {
    close();
    keys_fp_ = fopen(keys_path.c_str(), mode);
    if (!keys_fp_) {
      return false;
    }
    values_fp_ = fopen(values_path.c_str(), mode);
    if (!values_fp_) {
      close();
      return false;
    }
    scores_fp_ = fopen(scores_path.c_str(), mode);
    if (!scores_fp_) {
      close();
      return false;
    }
    return true;
  }

  /**
   * @brief Close the file from open status and release fd(s) on files
   * of keys, values, and scores. Calling it on a closed object is a no-op.
   */
  void close() noexcept {
    if (keys_fp_) {
      fclose(keys_fp_);
      keys_fp_ = nullptr;
    }
    if (values_fp_) {
      fclose(values_fp_);
      values_fp_ = nullptr;
    }
    if (scores_fp_) {
      fclose(scores_fp_);
      scores_fp_ = nullptr;
    }
  }

  /**
   * Read from file and fill into the keys, values, and scores buffer.
   * When calling save/load method from table, it can assume that the
   * received buffer of keys, vectors, and scores are automatically
   * pre-allocated.
   *
   * @param n The number of KV pairs expected to be read.
   * @param dim The dimension of the `vectors`.
   * @param keys The pointer to received buffer for keys.
   * @param vectors The pointer to received buffer for vectors.
   * @param scores The pointer to received buffer for scores.
   *
   * @return Number of KV pairs that have been successfully read. 0 is
   * returned when the file is not open or when the three files yield a
   * different number of items.
   */
  size_t read(const size_t n, const size_t dim, K* keys, V* vectors,
              M* scores) override {
    // Passing a null stream to fread() is undefined behavior; report
    // "nothing read" when the file has not been opened.
    if (!is_open()) {
      return 0;
    }
    const size_t nread_keys = fread(keys, sizeof(K), n, keys_fp_);
    const size_t nread_vecs = fread(vectors, sizeof(V) * dim, n, values_fp_);
    const size_t nread_scores = fread(scores, sizeof(M), n, scores_fp_);
    // The three files must stay in lockstep; a mismatch means at least one
    // of them is truncated or failed.
    if (nread_keys != nread_vecs || nread_keys != nread_scores) {
      return 0;
    }
    return nread_keys;
  }

  /**
   * Write keys, values, scores from table to the file.
   *
   * @param n The number of KV pairs to be written.
   * @param dim The dimension of the `vectors`.
   * @param keys The keys will be written to file.
   * @param vectors The vectors of values will be written to file.
   * @param scores The scores will be written to file.
   *
   * @return Number of KV pairs that have been successfully written. 0 is
   * returned when the file is not open or when the three files accepted a
   * different number of items.
   */
  size_t write(const size_t n, const size_t dim, const K* keys,
               const V* vectors, const M* scores) override {
    // Passing a null stream to fwrite() is undefined behavior; report
    // "nothing written" when the file has not been opened.
    if (!is_open()) {
      return 0;
    }
    const size_t nwritten_keys = fwrite(keys, sizeof(K), n, keys_fp_);
    const size_t nwritten_vecs =
        fwrite(vectors, sizeof(V) * dim, n, values_fp_);
    const size_t nwritten_scores = fwrite(scores, sizeof(M), n, scores_fp_);
    if (nwritten_keys != nwritten_vecs || nwritten_keys != nwritten_scores) {
      return 0;
    }
    return nwritten_keys;
  }

 private:
  // True only while all three underlying streams are open.
  bool is_open() const noexcept {
    return keys_fp_ != nullptr && values_fp_ != nullptr &&
           scores_fp_ != nullptr;
  }

  FILE* keys_fp_;
  FILE* values_fp_;
  FILE* scores_fp_;
};

}  // namespace merlin
}  // namespace nv


================================================
FILE: run_all_tests.sh
================================================
#!/bin/bash

# Usage : `bash run_all_tests.sh`
#
# Runs every executable file under ./build/ whose name ends with "_test" and
# exits with status 1 if at least one of them returns a non-zero status.

has_fail=false
found_any=false

# Walk the NUL-delimited `find` output so that paths containing whitespace or
# glob characters are neither split nor expanded. The list is read from file
# descriptor 3 so the test binaries keep the script's own stdin.
while IFS= read -r -d '' file <&3
do
    found_any=true
    echo "Executing $file ..."
    # Close descriptor 3 for the child; it only belongs to this loop.
    if ! "$file" 3<&-; then
      has_fail=true
    fi
done 3< <(find ./build/ -type f -name "*_test" -executable -print0)

# Running nothing still exits 0, but say so instead of passing silently.
if [ "$found_any" = false ] ; then
    echo "Warning: no *_test executables found under ./build/" >&2
fi

if [ "$has_fail" = true ] ; then
    exit 1
fi

================================================
FILE: tests/accum_or_assign_test.cc.cu
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <thread>
#include <unordered_map>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

// Value-vector dimension used by the tests below (assigned to `options.dim`).
constexpr size_t DIM = 16;
// Key, value-element and score types the test tables are instantiated with.
using K = uint64_t;
using V = float;
using S = uint64_t;
// Short aliases for the library types used throughout the tests.
using TableOptions = nv::merlin::HashTableOptions;
using EvictStrategy = nv::merlin::EvictStrategy;

// Predicate that matches entries whose key is odd and whose score is greater
// than `threshold`. `pattern` is accepted but not consulted.
template <class K, class S>
struct EraseIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    const bool key_is_odd = (key & 0x1u) == 0x1u;
    const bool above_threshold = score > threshold;
    return key_is_odd && above_threshold;
  }
};

// Predicate that matches entries whose score is greater than `threshold`.
// `key` and `pattern` are accepted but not consulted.
template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    const bool above_threshold = score > threshold;
    return above_threshold;
  }
};

// Fills a table whose capacity equals the number of keys, erases every key,
// fills it again with the same input, and checks that both fills leave the
// table at the same size.
//
// max_hbm_for_vectors: forwarded (in GB) to `options.max_hbm_for_vectors`.
// key_start: forwarded to `options.reserved_key_start_bit`.
void test_basic_when_full(size_t max_hbm_for_vectors, int key_start) {
  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_accum_or_assigns;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_accum_or_assigns;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));
  // cudaMalloc does not initialize memory; give the table well-defined flags
  // (all false) instead of whatever the allocation happens to contain.
  CUDA_CHECK(cudaMemset(d_accum_or_assigns, 0, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A fresh table per iteration; it has to start out empty.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,
                           d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_insert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Erasing every submitted key must leave the table empty.
    table->erase(KEY_NUM, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, 0);

    // The same input into the emptied table must reproduce the same size.
    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,
                           d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_reinsert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  // h_vectors was allocated with cudaMallocHost above and must be released
  // like the other pinned buffers.
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));
  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_accum_or_assigns));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Inserts keys through accum_or_assign with random accum/assign flags, runs
// erase_if with EraseIfPredFunctor, and checks that the erased count plus the
// remaining size equals the number of inserted keys and that the remaining
// entries still carry the expected scores and vectors.
//
// max_hbm_for_vectors: forwarded (in GB) to `options.max_hbm_for_vectors`.
// key_start: forwarded to `options.reserved_key_start_bit`.
void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;
  constexpr float true_ratio = 0.5;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;
  bool* h_accum_or_assigns;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  bool* d_accum_or_assigns;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);

    test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM, true_ratio);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,
                          KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,
                           d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // The table started empty, so only the keys whose flag is false are
    // expected to be present afterwards.
    size_t expected_size = 0;
    for (size_t i = 0; i < KEY_NUM; i++) {
      if (!h_accum_or_assigns[i]) expected_size++;
    }
    ASSERT_EQ(total_size, expected_size);

    K pattern = 100;
    S threshold = 0;
    size_t erase_num = table->template erase_if<EraseIfPredFunctor>(
        pattern, threshold, stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    // Erased entries plus survivors must add up to what was inserted.
    ASSERT_EQ((erase_num + total_size), expected_size);

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // Every surviving entry must report score == key and a vector filled
    // with key * 0.00001.
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        ASSERT_EQ(h_scores[i], h_keys[i]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, (expected_size - erase_num));

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  // h_vectors was allocated with cudaMallocHost above and must be released
  // like the other pinned buffers.
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));
  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_accum_or_assigns));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks that entries written through accum_or_assign() are still present,
// with the same scores and values, after the table is grown with
// reserve(MAX_CAPACITY).
//
// Every round builds a fresh table, submits KEY_NUM keys with random
// accum/assign flags and expects exactly the keys whose flag is false to be
// stored. After reserve() the size must be unchanged and every key reported
// by find() must satisfy score == key and value == key * 0.00001.
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//                      options.max_hbm_for_vectors.
// key_start:           stored in options.reserved_key_start_bit.
void test_rehash(size_t max_hbm_for_vectors, int key_start) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;
  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;
  constexpr uint64_t TEST_TIMES = 100;
  constexpr float true_ratio = 0.5;

  // Pinned host staging buffers (allocated with cudaMallocHost below).
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;
  bool* h_accum_or_assigns;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));

  // Device-side mirrors of the host buffers.
  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  bool* d_accum_or_assigns;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A brand-new table per round, so every round starts from size 0.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    // Fresh flags and keys for this round.
    // NOTE(review): create_keys_in_one_buckets presumably generates keys that
    // all map to a single bucket -- confirm against test_util.
    test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM, true_ratio);
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,
                          KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,
                           d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaDeviceSynchronize());

    // The table starts empty, so the test expects only the keys flagged
    // false (assign) to have been stored.
    size_t expected_size = 0;
    for (size_t i = 0; i < KEY_NUM; i++) {
      if (!h_accum_or_assigns[i]) expected_size++;
    }
    ASSERT_EQ(total_size, expected_size);

    // Exporting the whole table must yield the same number of entries.
    // This overwrites d_keys/d_vectors/d_scores with the exported data.
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(dump_counter, expected_size);

    // Grow the table; capacity must change, size must not.
    table->reserve(MAX_CAPACITY, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table->capacity(), MAX_CAPACITY);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, expected_size);

    // Restore the original key list (export_batch clobbered d_keys) and look
    // every key up again.
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    // Clear the host buffers before copying the lookup results back.
    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        // Every hit must carry score == key and value == key * 0.00001.
        ASSERT_EQ(h_scores[i], h_keys[i]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, expected_size);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  // Final device-to-host copy; the copied data is not inspected.
  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  // h_vectors was previously never released (pinned host memory leak).
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));
  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_accum_or_assigns));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks that accum_or_assign() grows the table when a batch pushes it past
// max_load_factor, and that sizes, scores and values stay consistent across
// two consecutive batches.
//
// Round 1 submits the first INIT_KEY_NUM keys, round 2 submits all KEY_NUM
// keys with a new set of random flags. The expected content is derived on the
// host from both flag sets and compared with size(), export_batch() and
// find().
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//                      options.max_hbm_for_vectors.
// key_start:           stored in options.reserved_key_start_bit.
void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start) {
  constexpr uint64_t INIT_CAPACITY = 1024;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024;
  constexpr uint64_t INIT_KEY_NUM = 1024;
  constexpr uint64_t KEY_NUM = 2048;

  // key -> value every dimension of that key is expected to hold at the end.
  std::unordered_map<K, float> expected_values;
  // Pinned host staging buffers (allocated with cudaMallocHost below).
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;
  bool* h_accum_or_assigns;
  bool* h_accum_or_assigns_init;
  float true_ratio = 0.6f;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = 128;
  options.max_load_factor = 0.6;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns_init, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_accum_or_assigns;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  uint64_t expected_size = 0;
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMemset(d_accum_or_assigns, 0, KEY_NUM * sizeof(bool)));

  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);

  // ---- Round 1: first INIT_KEY_NUM keys only. ----
  test_util::create_random_bools<K>(h_accum_or_assigns, INIT_KEY_NUM,
                                    true_ratio);
  CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,
                        INIT_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));

  table->accum_or_assign(INIT_KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,
                         d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_GE(table->capacity(), INIT_CAPACITY * 2);

  // On an empty table only keys flagged false (assign) are expected to land.
  expected_size = 0;
  for (int i = 0; i < INIT_KEY_NUM; i++)
    expected_size += (h_accum_or_assigns[i] ? 0 : 1);

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));

  // Keep the round-1 flags. Only INIT_KEY_NUM flags have been generated so
  // far and only those are read back later, so copy exactly that many instead
  // of also copying the still-uninitialised tail of the buffer.
  CUDA_CHECK(cudaMemcpy(h_accum_or_assigns_init, h_accum_or_assigns,
                        INIT_KEY_NUM * sizeof(bool), cudaMemcpyHostToHost));

  // ---- Round 2: all KEY_NUM keys with fresh flags. ----
  test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM, true_ratio);
  CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,
                        KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));

  table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,
                         d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Host-side model of the table after both rounds.
  expected_size = 0;
  for (int i = 0; i < KEY_NUM; i++) {
    if (i < INIT_KEY_NUM) {
      if (h_accum_or_assigns_init[i]) {
        // Not stored in round 1; it exists now only if round 2 assigned it.
        if (!h_accum_or_assigns[i]) {
          expected_size++;
          expected_values[h_keys[i]] = static_cast<float>(h_keys[i] * 0.00001);
        }
      } else {
        // Stored in round 1; a round-2 accum is expected to double the value,
        // otherwise the value is expected to stay as written.
        expected_size++;
        if (h_accum_or_assigns[i]) {
          expected_values[h_keys[i]] = static_cast<float>(h_keys[i] * 0.00002);
        } else {
          expected_values[h_keys[i]] = static_cast<float>(h_keys[i] * 0.00001);
        }
      }
    } else if (!h_accum_or_assigns[i]) {
      // Key seen for the first time in round 2 and flagged as assign.
      expected_size++;
      expected_values[h_keys[i]] = static_cast<float>(h_keys[i] * 0.00001);
    }
  }

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_GE(table->capacity(), KEY_NUM * 2);

  // This overwrites d_keys/d_vectors/d_scores with the exported data.
  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                     d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(dump_counter, expected_size);

  // Restore the original key list and look every key up.
  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
  table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int found_num = 0;

  CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(
      cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));
  for (int i = 0; i < KEY_NUM; i++) {
    if (h_found[i]) {
      found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j], expected_values[h_keys[i]]);
      }
    }
  }
  ASSERT_EQ(found_num, expected_size);

  table->clear(stream);
  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);
  CUDA_CHECK(cudaStreamDestroy(stream));

  // Final device-to-host copy; the copied data is not inspected.
  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  // h_vectors was previously never released (pinned host memory leak).
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));
  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns_init));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_accum_or_assigns));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}
//
// void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
//  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
//  constexpr uint64_t INIT_CAPACITY = 4 * 1024;
//  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
//  constexpr uint64_t KEY_NUM = 256;
//  constexpr uint64_t THREAD_N = 8;
//
//  std::vector<std::thread> threads;
//
//  TableOptions options;
//
//  options.init_capacity = INIT_CAPACITY;
//  options.max_capacity = MAX_CAPACITY;
//  options.dim = DIM;
//  options.max_load_factor = 0.50f;
//  options.max_bucket_size = BUCKET_MAX_SIZE;
//  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
//  using Table = nv::merlin::HashTable<K, V, S,
//  EvictStrategy::kLru>;
//
//  std::shared_ptr<Table> table = std::make_shared<Table>();
//  table->init(options);
//
//  auto worker_function = [&table, KEY_NUM, options](int task_n) {
//    K* h_keys;
//    V* h_vectors;
//    bool* h_found;
//
//    size_t current_capacity = table->capacity();
//
//    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
//    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
//    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));
//
//    K* d_keys;
//    V* d_vectors;
//    bool* d_found;
//
//    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
//    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
//    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
//
//    cudaStream_t stream;
//    CUDA_CHECK(cudaStreamCreate(&stream));
//
//    while (table->capacity() < MAX_CAPACITY) {
//      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
//                                                  KEY_NUM);
//      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
//                            cudaMemcpyHostToDevice));
//      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
//                            KEY_NUM * sizeof(V) * options.dim,
//                            cudaMemcpyHostToDevice));
//      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
//
//      table->accum_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
//      CUDA_CHECK(cudaStreamSynchronize(stream));
//
//      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
//      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
//
//      CUDA_CHECK(cudaStreamSynchronize(stream));
//      int found_num = 0;
//
//      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
//      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
//      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
//                            cudaMemcpyDeviceToHost));
//      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
//                            cudaMemcpyDeviceToHost));
//
//      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
//                            KEY_NUM * sizeof(V) * options.dim,
//                            cudaMemcpyDeviceToHost));
//      for (int i = 0; i < KEY_NUM; i++) {
//        if (h_found[i]) {
//          found_num++;
//          for (int j = 0; j < options.dim; j++) {
//            ASSERT_EQ(h_vectors[i * options.dim + j],
//                      static_cast<float>(h_keys[i] * 0.00001));
//          }
//        }
//      }
//      ASSERT_EQ(found_num, KEY_NUM);
//      if (task_n == 0 && current_capacity != table->capacity()) {
//        std::cout << "[test_dynamic_rehash_on_multi_threads] The capacity "
//                     "changed from "
//                  << current_capacity << " to " << table->capacity()
//                  << std::endl;
//        current_capacity = table->capacity();
//      }
//      CUDA_CHECK(cudaStreamSynchronize(stream));
//    }
//    CUDA_CHECK(cudaStreamDestroy(stream));
//
//    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
//                          KEY_NUM * sizeof(V) * options.dim,
//                          cudaMemcpyDeviceToHost));
//
//    CUDA_CHECK(cudaFreeHost(h_keys));
//    CUDA_CHECK(cudaFreeHost(h_found));
//    CUDA_CHECK(cudaFreeHost(h_vectors));
//
//    CUDA_CHECK(cudaFree(d_keys));
//    CUDA_CHECK(cudaFree(d_vectors));
//    CUDA_CHECK(cudaFree(d_found));
//    CUDA_CHECK(cudaDeviceSynchronize());
//
//    CudaCheckError();
//  };
//
//  for (int i = 0; i < THREAD_N; ++i)
//    threads.emplace_back(std::thread(worker_function, i));
//
//  for (auto& th : threads) {
//    th.join();
//  }
//  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
//}
//
// Exercises export_batch_if<ExportIfPredFunctor> on an LRU table populated
// through accum_or_assign().
//
// `threshold` is sampled with test_util::host_nano() before any key is
// written. After the insert the test checks size() and find(), then exports
// with the predicate and asserts that every exported score is greater than
// `threshold` and that every exported value equals key * 0.00001.
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//                      options.max_hbm_for_vectors.
// key_start:           stored in options.reserved_key_start_bit.
void test_export_batch_if(size_t max_hbm_for_vectors, int key_start) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;
  constexpr float true_ratio = 0.6;

  // Pinned host staging buffers (allocated with cudaMallocHost below).
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_accum_or_assigns;
  size_t h_dump_counter = 0;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  bool* d_accum_or_assigns;
  // Device-side counter that export_batch_if writes the export count into.
  size_t* d_dump_counter;
  int found_num = 0;
  bool* h_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));

  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  K pattern = 100;
  // Sampled before any insert; exported scores are asserted to exceed it.
  S threshold = test_util::host_nano<S>(stream);

  for (int i = 0; i < TEST_TIMES; i++) {
    test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM, true_ratio);
    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                                KEY_NUM);

    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,
                          KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // No scores are supplied (nullptr) for this kLru table.
    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,
                           nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // On an empty table only keys flagged false (assign) are expected to land.
    size_t expected_size = 0;
    for (size_t i = 0; i < KEY_NUM; i++) {
      if (!h_accum_or_assigns[i]) expected_size++;
    }

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, expected_size);

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, expected_size);

    // Predicate-filtered export; overwrites d_keys/d_vectors/d_scores.
    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,
        d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, h_dump_counter * sizeof(S),
                          cudaMemcpyDeviceToHost));

    // Every exported score must be above the threshold.
    size_t expected_export_count = 0;
    for (int i = 0; i < h_dump_counter; i++) {
      if (h_scores[i] > threshold) expected_export_count++;
    }
    ASSERT_EQ(expected_export_count, h_dump_counter);

    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    // Only the first h_dump_counter entries of the buffers are checked.
    for (int i = 0; i < h_dump_counter; i++) {
      ASSERT_GT(h_scores[i], threshold);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  // h_vectors was previously never released (pinned host memory leak).
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));
  CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_accum_or_assigns));
  CUDA_CHECK(cudaFree(d_dump_counter));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks the scores that an EvictStrategy::kLru table assigns during
// accum_or_assign().
//
// Each of the TEST_TIMES rounds creates a fresh table and runs two phases:
//   1. Submit BASE_KEY_NUM "base" keys and assert that every exported score
//      lies between two host timestamps taken around the call.
//   2. Submit TEST_KEY_NUM "test" keys, two of which alias base keys 72 and
//      73, and assert which entries received a score inside the new
//      timestamp window and which kept a score <= the window start.
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//                      options.max_hbm_for_vectors.
void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  // The shared scratch buffers are sized for the larger of the two key sets.
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;
  constexpr float true_ratio = 0.5;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  // Host data for phase 1. Flags are stored as uint8_t and reinterpreted as
  // bool when they are generated.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);

  // Host data for phase 2.
  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);

  // Host landing area for exported table content.
  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device scratch buffers, reused for both inputs and exports.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;
  bool* d_accum_or_assigns_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));

  // Base keys are drawn from [0, 0x3FFFFFFFFFFFFFFF) and test keys from
  // [0x3FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFD).
  // NOTE(review): the last two arguments look like a key range and the name
  // suggests all keys map to one bucket -- confirm against test_util.
  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,
      true_ratio);
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,
      true_ratio);
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Force base keys 72 and 73 to be flagged false so that the phase-1 size
  // check counts them as stored.
  h_accum_or_assigns_base[72] = false;
  h_accum_or_assigns_base[73] = false;

  // Test keys 2 and 3 alias those two base keys.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  // Key 2 is submitted flagged true (accum), key 3 flagged false (assign).
  h_accum_or_assigns_test[2] = true;
  h_accum_or_assigns_test[3] = false;

  // The aliased test keys carry the same vectors as their base counterparts.
  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A brand-new table per round; the host-side inputs stay the same.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // ---- Phase 1: submit the base keys. ----
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),
                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));
      // Timestamps taken around the call; the scores exported afterwards are
      // asserted to lie inside [start_ts, end_ts].
      S start_ts = test_util::host_nano<S>(stream);
      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      // Note: this total_size (and the loop index i below) shadows the
      // variable of the same name in the enclosing scope.
      size_t total_size = table->size(stream);
      // On an empty table only keys flagged false are expected to be stored.
      size_t expected_size = 0;
      for (int i = 0; i < BASE_KEY_NUM; i++) {
        if (!h_accum_or_assigns_base[i]) expected_size++;
      }
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, expected_size);

      // Export the whole table; this overwrites the device scratch buffers.
      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Every stored entry must have a score inside the timestamp window and
      // the value key * 0.00001 in every dimension.
      for (int i = 0; i < dump_counter; i++) {
        ASSERT_GE(h_scores_temp[i], start_ts);
        ASSERT_LE(h_scores_temp[i], end_ts);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // ---- Phase 2: submit the test keys on top of the base content. ----
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),
                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));
      // New timestamp window for the second call.
      S start_ts = test_util::host_nano<S>(stream);
      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Expected size = entries stored in phase 1 ...
      size_t expected_size = 0;
      for (int i = 0; i < BASE_KEY_NUM; i++) {
        if (!h_accum_or_assigns_base[i]) expected_size++;
      }

      // ... plus every test key that is not a base key and is flagged false.
      for (int i = 0; i < TEST_KEY_NUM; i++) {
        if ((h_keys_base.end() == std::find(h_keys_base.begin(),
                                            h_keys_base.end(),
                                            h_keys_test[i])) &&
            !h_accum_or_assigns_test[i])
          expected_size++;
      }
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        // is_accum: the entry is the aliased key that was submitted flagged
        // true. is_new_insert: the entry's key appears anywhere in the test
        // key list (this includes the two aliased keys).
        bool is_accum = (h_keys_temp[i] == h_keys_test[2]);
        bool is_new_insert =
            (h_keys_test.end() !=
             std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]));
        if (is_accum) {
          // The accumulated entry is expected to hold twice its base value.
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                      static_cast<float>(h_keys_temp[i] * 0.00002));
          }
        } else {
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                      static_cast<float>(h_keys_temp[i] * 0.00001));
          }
        }
        // The accumulated key and the test keys other than h_keys_test[3]
        // must carry a score from the phase-2 window; every other entry,
        // including the aliased key that was submitted flagged false, must
        // have a score no later than the window start.
        if (is_accum || (is_new_insert && (h_keys_temp[i] != h_keys_test[3]))) {
          ASSERT_GE(h_scores_temp[i], start_ts);
          ASSERT_LE(h_scores_temp[i], end_ts);
        } else {
          ASSERT_LE(h_scores_temp[i], start_ts);
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));
  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Checks `accum_or_assign` on a table instantiated with `EvictStrategy::kLfu`
 * when the caller supplies the scores.
 *
 * Each of the TEST_TIMES rounds regenerates both key sets, builds a fresh
 * table and submits two batches:
 *  - Batch 1: BASE_KEY_NUM base keys with random flags. The table must then
 *    hold exactly as many entries as there are flags equal to false, and every
 *    exported entry must carry score `key % freq_range` and vector elements
 *    `key * 0.00001`.
 *  - Batch 2: TEST_KEY_NUM test keys, where slots 2 and 3 are overwritten with
 *    base keys 72 (flag true) and 73 (flag false). The exported entry for base
 *    key 72 must carry a doubled score and doubled vector elements; all other
 *    exported entries must still carry the single-write values.
 *
 * Any failing ASSERT_* returns immediately, so the device buffers and the
 * stream are not released on a failed run.
 *
 * @param max_hbm_for_vectors Forwarded through `nv::merlin::GB()` into
 *        `TableOptions::max_hbm_for_vectors`.
 * @param key_start Value stored in `TableOptions::reserved_key_start_bit`.
 */
void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors, int key_start) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 1024;
  constexpr float true_ratio = 0.5;

  // The flag vectors are uint8_t storage that is filled through a bool* view
  // and uploaded into a bool device buffer, so both types must be one byte.
  static_assert(sizeof(bool) == sizeof(uint8_t),
                "accum_or_assign flags are stored as uint8_t but used as bool");

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;

  // Host copies of the first batch.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);

  // Host copies of the second batch.
  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);

  // Host landing area for exported table contents.
  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device staging buffers, shared by uploads and exports.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;
  bool* d_accum_or_assigns_temp;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));

  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,
      true_ratio);

  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,
      true_ratio);

  // One stream serves every round; nothing in a round depends on it being
  // freshly created.
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
        h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
        BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
        freq_range);

    test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
        h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
        TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
        0xFFFFFFFFFFFFFFFD, freq_range);

    // Base keys 72 and 73 must be present after batch 1, so force their flags
    // to false (the value the size expectation below counts as stored).
    h_accum_or_assigns_base[72] = false;
    h_accum_or_assigns_base[73] = false;

    // Re-submit those two keys in batch 2: one flagged true, one flagged false.
    h_keys_test[2] = h_keys_base[72];
    h_keys_test[3] = h_keys_base[73];

    h_accum_or_assigns_test[2] = true;
    h_accum_or_assigns_test[3] = false;

    h_scores_test[2] = h_keys_base[72] % freq_range;
    h_scores_test[3] = h_keys_base[73] % freq_range;

    for (size_t d = 0; d < options.dim; d++) {
      h_vectors_test[2 * options.dim + d] =
          h_vectors_base[72 * options.dim + d];
      h_vectors_test[3 * options.dim + d] =
          h_vectors_base[73 * options.dim + d];
    }

    size_t dump_counter = 0;
    S global_epoch = 1;
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    size_t total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Number of base entries expected in the table: one per false flag.
    size_t base_assign_count = 0;
    for (uint64_t k = 0; k < BASE_KEY_NUM; k++) {
      if (!h_accum_or_assigns_base[k]) base_assign_count++;
    }

    // ---- Batch 1: base keys -------------------------------------------------
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),
                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));

      table->set_global_epoch(global_epoch);
      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      const size_t expected_size = base_assign_count;
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // ---- Batch 2: test keys -------------------------------------------------
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),
                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));

      table->set_global_epoch(global_epoch);
      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // On top of the base entries, only test keys that are new to the table
      // and flagged false are expected to add an entry.
      size_t expected_size = base_assign_count;
      for (uint64_t k = 0; k < TEST_KEY_NUM; k++) {
        const bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_test[k]);
        if (!in_base && !h_accum_or_assigns_test[k]) expected_size++;
      }
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        // Only base key 72 was re-submitted with a true flag.
        const bool is_accum = (h_keys_temp[i] == h_keys_test[2]);

        if (is_accum) {
          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) * 2);
          for (size_t j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                      static_cast<float>(h_keys_temp[i] * 0.00002));
          }
        } else {
          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));
          for (size_t j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                      static_cast<float>(h_keys_temp[i] * 0.00001));
          }
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));
  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Basic `accum_or_assign` scenario for a table instantiated with
 * `EvictStrategy::kEpochLru`.
 *
 * No scores are handed to the table (`nullptr` is passed); the exported
 * scores are instead validated against host timestamps taken around each call
 * (low 32 bits) and against the global epoch (high 32 bits, second batch
 * only).
 *
 * The key sets and flags are generated once. Each of the TEST_TIMES rounds
 * builds a fresh table and submits the base batch followed by the test batch,
 * in which slots 2 and 3 are base keys 72 (flag true) and 73 (flag false).
 * `global_epoch` keeps growing across rounds.
 *
 * @param max_hbm_for_vectors Forwarded through `nv::merlin::GB()` into
 *        `TableOptions::max_hbm_for_vectors`.
 * @param key_start Value stored in `TableOptions::reserved_key_start_bit`.
 */
void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr int RSHIFT_ON_NANO = 20;

  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;
  constexpr float true_ratio = 0.5;

  // Keeps the low 32 bits of a score or of a shifted host timestamp.
  const S low32_mask = 0xFFFFFFFF;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;

  // Transfer sizes used by the copies below.
  const size_t base_key_bytes = BASE_KEY_NUM * sizeof(K);
  const size_t base_score_bytes = BASE_KEY_NUM * sizeof(S);
  const size_t base_vector_bytes = BASE_KEY_NUM * sizeof(V) * options.dim;
  const size_t test_key_bytes = TEST_KEY_NUM * sizeof(K);
  const size_t test_score_bytes = TEST_KEY_NUM * sizeof(S);
  const size_t test_vector_bytes = TEST_KEY_NUM * sizeof(V) * options.dim;
  const size_t temp_key_bytes = TEMP_KEY_NUM * sizeof(K);
  const size_t temp_score_bytes = TEMP_KEY_NUM * sizeof(S);
  const size_t temp_vector_bytes = TEMP_KEY_NUM * sizeof(V) * options.dim;

  // First batch (host side).
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);

  // Second batch (host side).
  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);

  // Host landing area for exported table contents.
  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device staging buffers, reused for uploads and exports.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;
  bool* d_accum_or_assigns_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, temp_key_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, temp_score_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors_temp, temp_vector_bytes));

  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));

  // The generator calls stay in this order in case they share random state.
  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,
      true_ratio);
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,
      true_ratio);
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Base keys 72 and 73 get a false flag in the first batch and are then
  // re-submitted in the second batch: 72 with a true flag, 73 with a false one.
  h_accum_or_assigns_base[72] = false;
  h_accum_or_assigns_base[73] = false;

  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_accum_or_assigns_test[2] = true;
  h_accum_or_assigns_test[3] = false;

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S global_epoch = 1;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // ---- Batch 1: base keys -------------------------------------------------
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(), base_key_bytes,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            base_score_bytes, cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            base_vector_bytes, cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),
                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));

      // Bracket the table call with host timestamps.
      const S start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & low32_mask;
      table->set_global_epoch(global_epoch);
      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      const S end_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & low32_mask;

      total_size = table->size(stream);
      // One entry is expected per base flag that is false.
      const size_t expected_size = static_cast<size_t>(
          std::count(h_accum_or_assigns_base.begin(),
                     h_accum_or_assigns_base.end(), static_cast<uint8_t>(0)));
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp, base_key_bytes,
                            cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            base_score_bytes, cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            base_vector_bytes, cudaMemcpyDefault));

      for (size_t row = 0; row < dump_counter; row++) {
        const S ts_bits = h_scores_temp[row] & low32_mask;
        ASSERT_GE(ts_bits, start_ts);
        ASSERT_LE(ts_bits, end_ts);
        for (size_t col = 0; col < options.dim; col++) {
          ASSERT_EQ(h_vectors_temp[row * options.dim + col],
                    static_cast<float>(h_keys_temp[row] * 0.00001));
        }
      }
    }

    // ---- Batch 2: test keys -------------------------------------------------
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(), test_key_bytes,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            test_score_bytes, cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            test_vector_bytes, cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),
                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));

      const S start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & low32_mask;
      table->set_global_epoch(global_epoch);
      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      const S end_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & low32_mask;

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Base entries, plus every test key that is new to the table and carries
      // a false flag.
      size_t expected_size = static_cast<size_t>(
          std::count(h_accum_or_assigns_base.begin(),
                     h_accum_or_assigns_base.end(), static_cast<uint8_t>(0)));
      for (uint64_t t = 0; t < TEST_KEY_NUM; t++) {
        if (h_accum_or_assigns_test[t]) continue;
        const bool known_base_key =
            std::find(h_keys_base.begin(), h_keys_base.end(),
                      h_keys_test[t]) != h_keys_base.end();
        if (!known_base_key) expected_size++;
      }
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp, temp_key_bytes,
                            cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            temp_score_bytes, cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            temp_vector_bytes, cudaMemcpyDefault));

      for (size_t row = 0; row < dump_counter; row++) {
        const K key = h_keys_temp[row];
        const bool is_accum = (key == h_keys_test[2]);
        const bool in_test_batch =
            std::find(h_keys_test.begin(), h_keys_test.end(), key) !=
            h_keys_test.end();

        // The key flagged true holds a doubled vector; all others are single.
        const double scale = is_accum ? 0.00002 : 0.00001;
        for (size_t col = 0; col < options.dim; col++) {
          ASSERT_EQ(h_vectors_temp[row * options.dim + col],
                    static_cast<float>(key * scale));
        }

        // Entries expected to carry this batch's timestamp and epoch: the
        // accumulated key and test keys other than the re-submitted key 73.
        const bool refreshed =
            is_accum || (in_test_batch && (key != h_keys_test[3]));
        const S ts_bits = h_scores_temp[row] & low32_mask;
        const S epoch_bits = (h_scores_temp[row] >> 32) & low32_mask;
        if (refreshed) {
          ASSERT_GE(ts_bits, start_ts);
          ASSERT_LE(ts_bits, end_ts);
          ASSERT_EQ(epoch_bits, global_epoch);
        } else {
          ASSERT_LE(ts_bits, start_ts);
          ASSERT_EQ(epoch_bits, global_epoch - 1);
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));
  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Checks `accum_or_assign` on a table instantiated with
 * `EvictStrategy::kEpochLfu` when the caller supplies the scores.
 *
 * The key sets and flags are generated once. Each of the TEST_TIMES rounds
 * builds a fresh table and submits two batches:
 *  - Batch 1: BASE_KEY_NUM base keys. Base keys 71, 72 and 73 are forced to a
 *    false flag; key 71 additionally gets a score just below the uint32
 *    maximum so that doubling it overflows the low 32 bits.
 *  - Batch 2: TEST_KEY_NUM test keys, where slots 1, 2 and 3 are base keys 71
 *    (flag true), 72 (flag true) and 73 (flag false).
 *
 * Expected scores are built with
 * `test_util::make_expected_score_for_epochlfu<S>(epoch, frequency)`.
 * `global_epoch` keeps growing across rounds.
 *
 * Any failing ASSERT_* returns immediately, so the device buffers and the
 * stream are not released on a failed run.
 *
 * @param max_hbm_for_vectors Forwarded through `nv::merlin::GB()` into
 *        `TableOptions::max_hbm_for_vectors`.
 * @param key_start Value stored in `TableOptions::reserved_key_start_bit`.
 */
void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 1024;
  constexpr float true_ratio = 0.5;

  // The flag vectors are uint8_t storage that is filled through a bool* view
  // and uploaded into a bool device buffer, so both types must be one byte.
  static_assert(sizeof(bool) == sizeof(uint8_t),
                "accum_or_assign flags are stored as uint8_t but used as bool");

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;

  // Host copies of the first batch.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);

  // Host copies of the second batch.
  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);

  // Host landing area for exported table contents.
  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device staging buffers, shared by uploads and exports.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;
  bool* d_accum_or_assigns_temp;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));

  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,
      true_ratio);

  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,
      true_ratio);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // These three base keys must be present after batch 1.
  h_accum_or_assigns_base[71] = false;
  h_accum_or_assigns_base[72] = false;
  h_accum_or_assigns_base[73] = false;

  // Simulate overflow of low 32bits.
  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -
                                     static_cast<uint32_t>(1));

  // Re-submit the three base keys in batch 2.
  h_keys_test[1] = h_keys_base[71];
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_accum_or_assigns_test[1] = true;
  h_accum_or_assigns_test[2] = true;
  h_accum_or_assigns_test[3] = false;

  h_scores_test[1] = h_scores_base[71];
  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[1 * options.dim + d] = h_vectors_base[71 * options.dim + d];
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }

  // Number of base entries expected in the table: one per false flag. The
  // flags are not modified after this point.
  size_t base_assign_count = 0;
  for (uint64_t k = 0; k < BASE_KEY_NUM; k++) {
    if (!h_accum_or_assigns_base[k]) base_assign_count++;
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S global_epoch = 1;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // ---- Batch 1: base keys -------------------------------------------------
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),
                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));

      table->set_global_epoch(global_epoch);
      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      const size_t expected_size = base_assign_count;
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        // Key 71 carries the near-overflow score; every other key carries
        // `key % freq_range`.
        S expected_score;
        if (h_keys_temp[i] == h_keys_base[71]) {
          expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, h_scores_base[71]);
        } else {
          expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, (h_keys_temp[i] % freq_range));
        }
        ASSERT_EQ(h_scores_temp[i], expected_score);

        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // ---- Batch 2: test keys -------------------------------------------------
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),
                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));

      table->set_global_epoch(global_epoch);
      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // On top of the base entries, only test keys that are new to the table
      // and flagged false are expected to add an entry.
      size_t expected_size = base_assign_count;
      for (uint64_t k = 0; k < TEST_KEY_NUM; k++) {
        const bool known_base_key =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_test[k]);
        if (!known_base_key && !h_accum_or_assigns_test[k]) expected_size++;
      }
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Base key 71 must be among the exported entries. Only the first
      // `dump_counter` slots hold this export; the tail of the buffer still
      // contains data from earlier uploads (including the base keys) and
      // would make the check pass vacuously.
      const auto exported_end =
          h_keys_temp.begin() +
          std::min<size_t>(dump_counter, h_keys_temp.size());
      const auto overflow_key_pos =
          std::find(h_keys_temp.begin(), exported_end, h_keys_base[71]);
      ASSERT_TRUE(exported_end != overflow_key_pos);

      for (size_t i = 0; i < dump_counter; i++) {
        const bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);

        // Test slots 1 and 2 (base keys 71 and 72) were submitted with a true
        // flag.
        const bool is_accum = (h_keys_temp[i] == h_keys_test[1] ||
                               h_keys_temp[i] == h_keys_test[2]);

        S expected_score;
        if (h_keys_temp[i] == h_keys_base[71]) {
          // Always an accumulated key (it is test slot 1); the doubled score
          // exceeds the low 32 bits.
          expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, h_scores_base[71] * 2);
        } else if (is_accum) {
          expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, (h_keys_temp[i] % freq_range) * 2);
        } else {
          // Entries not accumulated in this batch: base keys are expected to
          // keep the previous epoch, new keys to carry the current one.
          expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch - static_cast<S>(in_base),
              (h_keys_temp[i] % freq_range));
        }
        ASSERT_EQ(h_scores_temp[i], expected_score);

        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] *
                                       (is_accum ? 0.00002 : 0.00001)))
              << ",i=" << i << ",is_accum=" << is_accum;
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));
  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Basic check of `accum_or_assign` on a table that uses
 * `EvictStrategy::kCustomized` (scores are supplied by the caller).
 *
 * Every round builds a fresh table and runs two phases:
 *   1. A base batch with random accum/assign flags is submitted. The test
 *      expects exactly the keys flagged `assign` (flag == false) to be
 *      present afterwards, carrying the submitted score and a vector equal
 *      to `key * 0.00001`.
 *   2. A test batch is submitted whose upper half (indices 64..127) reuses
 *      the base keys. The table size must stay within
 *      [expected_size, BUCKET_MAX_SIZE] and every exported entry must carry
 *      the score/vector implied by the base and test flags.
 *
 * @param max_hbm_for_vectors Forwarded through `nv::merlin::GB()` into
 *                            `TableOptions::max_hbm_for_vectors`.
 * @param key_start           Value for
 *                            `TableOptions::reserved_key_start_bit`.
 */
void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 128;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;
  constexpr float true_ratio = 0.3;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Host-side inputs of the base batch.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);

  // Host-side inputs of the test batch.
  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);

  // Host-side landing area for exported table content.
  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);
  std::vector<uint8_t> h_found_temp(TEMP_KEY_NUM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;
  bool* d_accum_or_assigns_temp;
  bool* d_found_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_found_temp, TEMP_KEY_NUM * sizeof(bool)));

  // The flags are drawn once, so every round replays the same scenario.
  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,
      true_ratio);
  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,
      true_ratio);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  // Base scores: 1000, 1001, ...
  const S base_score_start = 1000;
  for (int i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);
  // Test scores continue right after the highest base score.
  const S test_score_start = base_score_start + BASE_KEY_NUM;
  for (int i = 0; i < TEST_KEY_NUM; i++) {
    h_scores_test[i] = test_score_start + i;
  }
  // Upper half of the test batch reuses the base keys (and their vectors)
  // while keeping the new, higher test scores.
  for (int i = 64; i < TEST_KEY_NUM; i++) {
    h_keys_test[i] = h_keys_base[i];
    //    h_scores_test[i] = h_scores_base[i];
    for (int j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t t = 0; t < TEST_TIMES; t++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: base batch into the empty table.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),
                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));
      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      // Only `assign` requests (flag == false) are expected to create keys.
      size_t expected_size = 0;
      for (int i = 0; i < BASE_KEY_NUM; i++) {
        if (!h_accum_or_assigns_base[i]) expected_size++;
      }
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        // Look the exported key up in the base batch to recover its
        // submitted score and flag.
        S expected_score = 0ul;
        bool is_accum = false;
        for (int j = 0; j < BASE_KEY_NUM; j++) {
          if (h_keys_base[j] == h_keys_temp[i]) {
            expected_score = h_scores_base[j];
            is_accum = h_accum_or_assigns_base[j];
          }
        }
        ASSERT_FALSE(is_accum);
        ASSERT_EQ(expected_score, h_scores_temp[i]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: test batch on top of the base content.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),
                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));
      // The lookup result is fetched but not inspected below; the call only
      // exercises `find` before the update.
      table->find(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_found_temp,
                  nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      CUDA_CHECK(cudaMemcpy(h_found_temp.data(), d_found_temp,
                            TEST_KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));

      // `find` wrote into d_vectors_temp, so reload the test vectors.
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Lower bound: inserted base keys plus brand-new `assign` test keys,
      // capped by the bucket size.
      size_t expected_size = 0;
      for (int i = 0; i < BASE_KEY_NUM; i++) {
        if (!h_accum_or_assigns_base[i]) expected_size++;
      }

      for (int i = 0; i < TEST_KEY_NUM; i++) {
        if ((h_keys_base.end() == std::find(h_keys_base.begin(),
                                            h_keys_base.end(),
                                            h_keys_test[i])) &&
            !h_accum_or_assigns_test[i])
          expected_size++;
      }
      expected_size = std::min(expected_size, BUCKET_MAX_SIZE);

      // Some keys in base could be evicted in one operation that allows the
      // same key with `assign` flag in the test can be inserted.
      ASSERT_GE(total_size, expected_size);
      ASSERT_LE(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, total_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        // 888 is a sentinel that matches none of the submitted scores.
        S base_score = 888ul;
        S test_score = 888ul;
        bool is_accum_test = false;
        bool is_found_on_base = false;
        for (int j = 0; j < BASE_KEY_NUM; j++) {
          if (h_keys_base[j] == h_keys_temp[i]) {
            // "Found on base" means the base batch actually inserted it.
            is_found_on_base = !h_accum_or_assigns_base[j];
            base_score = h_scores_base[j];
            break;
          }
        }
        for (int j = 0; j < TEST_KEY_NUM; j++) {
          if (h_keys_test[j] == h_keys_temp[i]) {
            is_accum_test = h_accum_or_assigns_test[j];
            test_score = h_scores_test[j];
            break;
          }
        }

        // An `accum` request on a key the base batch never inserted must not
        // create the key. Reported through gtest so that it is neither
        // compiled out under NDEBUG nor aborts the whole test binary.
        ASSERT_FALSE(!is_found_on_base && is_accum_test)
            << "key " << h_keys_temp[i]
            << " is in the table although it was only submitted with `accum`";

        // existing + accum  -> score refreshed to the test score
        // existing + assign -> request ignored, base score kept
        // missing  + assign -> inserted with the test score
        const S expected_score =
            (is_found_on_base && !is_accum_test) ? base_score : test_score;

        // Some keys in base could be evicted in one operation that allows the
        // same key with `assign` flag in the test can be inserted.
        ASSERT_EQ(expected_score, h_scores_temp[i])
            << " " << is_found_on_base << " " << is_accum_test << " "
            << base_score << " " << test_score;
        if (is_found_on_base && is_accum_test) {
          // Accumulated once on top of the original value.
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                      static_cast<float>(h_keys_temp[i] * 0.00002));
          }
        } else {
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                      static_cast<float>(h_keys_temp[i] * 0.00001));
          }
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));
  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));
  CUDA_CHECK(cudaFree(d_found_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Targeted check of `accum_or_assign` with `EvictStrategy::kCustomized` when
 * the bucket is already full.
 *
 * Every round builds a fresh table, fills it with BASE_KEY_NUM base keys
 * (scores 1000, 1001, ...) and then submits an 8-key test batch:
 *   - keys 0..3 are new keys flagged `assign` with scores below every base
 *     score; the test expects none of them to be present afterwards;
 *   - keys 4..6 are existing base keys flagged `accum`; the test expects
 *     their score to become the submitted one and their vector to double;
 *   - key 7 is an existing base key flagged `assign`; the test expects its
 *     score to stay different from the submitted one and its vector to stay
 *     unchanged.
 *
 * @param max_hbm_for_vectors Forwarded through `nv::merlin::GB()` into
 *                            `TableOptions::max_hbm_for_vectors`.
 * @param key_start           Value for
 *                            `TableOptions::reserved_key_start_bit`.
 */
void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,
                                             int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 8;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 256;
  constexpr float base_true_ratio = 0.0f;
  constexpr float test_true_ratio = 0.5f;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Host-side inputs of the base batch.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_base(BASE_KEY_NUM);

  // Host-side inputs of the test batch.
  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);
  std::vector<uint8_t> h_accum_or_assigns_test(TEST_KEY_NUM);

  // Host-side landing area for exported table content.
  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;
  bool* d_accum_or_assigns_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, TEMP_KEY_NUM * sizeof(bool)));

  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_base.data()), BASE_KEY_NUM,
      base_true_ratio);
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  // Base scores: 1000, 1001, ...
  const S base_score_start = 1000;
  for (int i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_random_bools<K>(
      reinterpret_cast<bool*>(h_accum_or_assigns_test.data()), TEST_KEY_NUM,
      test_true_ratio);
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // The last four test keys alias existing base keys.
  h_keys_test[4] = h_keys_base[72];
  h_keys_test[5] = h_keys_base[73];
  h_keys_test[6] = h_keys_base[74];
  h_keys_test[7] = h_keys_base[75];

  // Make sure the aliased base keys are inserted by the base batch.
  h_accum_or_assigns_base[72] = false;
  h_accum_or_assigns_base[73] = false;
  h_accum_or_assigns_base[74] = false;
  h_accum_or_assigns_base[75] = false;
  // replace four new keys to lower scores, would not be inserted.
  h_scores_test[0] = 20;
  h_scores_test[1] = 78;
  h_scores_test[2] = 97;
  h_scores_test[3] = 98;

  // New scores for the four existing keys. Only the three flagged `accum`
  // below are expected to have their score refreshed.
  h_scores_test[4] = 99;
  h_scores_test[5] = 1010;
  h_scores_test[6] = 1020;
  h_scores_test[7] = 1035;

  // The random flags drawn above are overridden with a fixed pattern.
  h_accum_or_assigns_test[0] = false;
  h_accum_or_assigns_test[1] = false;
  h_accum_or_assigns_test[2] = false;
  h_accum_or_assigns_test[3] = false;

  h_accum_or_assigns_test[4] = true;
  h_accum_or_assigns_test[5] = true;
  h_accum_or_assigns_test[6] = true;
  h_accum_or_assigns_test[7] = false;

  // Vectors of the aliased keys must be recomputed for the new key values.
  for (int i = 4; i < TEST_KEY_NUM; i++) {
    for (int j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] =
          static_cast<V>(h_keys_test[i] * 0.00001);
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t t = 0; t < TEST_TIMES; t++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: base batch into the empty table.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base.data(),
                     BASE_KEY_NUM * sizeof(uint8_t), cudaMemcpyHostToDevice));
      table->accum_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Only `assign` requests (flag == false) are expected to create keys.
      size_t expected_size = 0;
      for (const auto accum : h_accum_or_assigns_base) {
        if (!accum) expected_size++;
      }

      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        // Recover the score that was submitted for this exported key.
        S expected_score = 0ul;
        for (int j = 0; j < BASE_KEY_NUM; j++) {
          if (h_keys_temp[i] == h_keys_base[j]) {
            expected_score = h_scores_base[j];
            break;
          }
        }
        ASSERT_EQ(h_scores_temp[i], expected_score);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: the 8-key test batch on top of the full bucket.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(
          cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_test.data(),
                     TEST_KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));

      table->accum_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                             d_accum_or_assigns_temp, d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t expected_size = 0;
      for (const auto accum : h_accum_or_assigns_base) {
        if (!accum) expected_size++;
      }
      // NOTE(review): with base_true_ratio == 0 presumably every base key is
      // inserted, so this already equals BUCKET_MAX_SIZE; the clamp only
      // matters if that ratio is ever changed.
      expected_size = std::max(expected_size, BUCKET_MAX_SIZE);
      ASSERT_EQ(total_size, expected_size);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, expected_size);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Low-score new keys (0..3) must be absent, aliased keys (4..7) present.
      for (int i = 0; i < TEST_KEY_NUM; i++) {
        if (i < 4) {
          ASSERT_EQ(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        } else {
          ASSERT_NE(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        }
      }
      for (int i = 0; i < TEMP_KEY_NUM; i++) {
        // `accum` on an existing key refreshes its score ...
        if (h_keys_temp[i] == h_keys_test[4])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);
        if (h_keys_temp[i] == h_keys_test[5])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);
        if (h_keys_temp[i] == h_keys_test[6])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);
        // ... while `assign` on an existing key leaves it untouched.
        if (h_keys_temp[i] == h_keys_test[7])
          ASSERT_NE(h_scores_temp[i], h_scores_test[7]);
        // Only test keys 4..6 were accumulated, doubling their vectors.
        bool is_accum =
            (h_keys_temp[i] != h_keys_test[7]) &&
            (h_keys_test.end() != std::find(h_keys_test.begin() + 4,
                                            h_keys_test.end(), h_keys_temp[i]));
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] *
                                       (is_accum ? 0.00002 : 0.00001)));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));
  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors,
                                                 int key_start = 0) {
  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
  constexpr uint64_t STEPS = 128;
  constexpr uint64_t MAX_BUCKET_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t TEST_TIMES = 1;
  float expected_correct_rate = 0.964;
  const int rounds = 3;
  constexpr float true_ratio = 0.0;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = MAX_BUCKET_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();
  S* h_scores_base = test_util::HostBuffer<S>(BATCH_SIZE).ptr();
  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();
  bool* h_accum_or_assigns_base = test_util::HostBuffer<bool>(BATCH_SIZE).ptr();

  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();
  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();
  V* h_vectors_temp =
      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;
  bool* d_accum_or_assigns_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_accum_or_assigns_temp, MAX_CAPACITY * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t global_start_key = 100000;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    size_t start_key = global_start_key;

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    for (int r = 0; r < rounds; r++) {
      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;
      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;
      size_t expected_table_size =
          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)
                   : INIT_CAPACITY;

      for (int s = 0; s < STEPS; s++) {
        test_util::create_random_bools<K>(h_accum_or_assigns_base, BATCH_SIZE,
                                          true_ratio);
        test_util::create_continuous_keys<K, S, V, DIM>(
            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);
        start_key += BATCH_SIZE;

        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),
                              cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,
                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,
                              BATCH_SIZE * sizeof(V) * options.dim,
                              cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_accum_or_assigns_temp, h_accum_or_assigns_base,
                              BATCH_SIZE * sizeof(bool),
                              cudaMemcpyHostToDevice));
        table->accum_or_assign(BATCH_SIZE, d_keys_temp, d_vectors_temp,
                               d_accum_or_assigns_temp, d_scores_temp, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_GE(total_size, expected_table_size);
      ASSERT_EQ(MAX_CAPACITY, table->capacity());

      size_t dump_counter = table->export_batch(
          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,
                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,
                            MAX_CAPACITY * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));

      size_t bigger_score_counter = 0;
      K max_key = 0;
      size_t values_error_counter = 0;
      for (int i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);
        max_key = std::max(max_key, h_keys_temp[i]);
        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;
        for (int j = 0; j < options.dim; j++) {
          if (h_vectors_temp[i * options.dim + j] !=
              static_cast<float>(h_keys_temp[i] * 0.00001)) {
            values_error_counter++;
          }
        }
      }

      ASSERT_EQ(values_error_counter, 0);
      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;
      std::cout << std::setprecision(3) << "[Round " << r << "]"
                << "correct_rate=" << correct_rate << std::endl;
      ASSERT_GE(max_key, expected_max_key);
      ASSERT_GE(correct_rate, expected_correct_rate);
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));
  CUDA_CHECK(cudaFree(d_accum_or_assigns_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Multi-threaded stress test for `find` / `accum_or_assign` on one shared
 * table (`EvictStrategy::kLru`, `max_load_factor` 0.5) while its capacity
 * changes.
 *
 * THREAD_N host threads each loop until `table->capacity() * 2` reaches
 * MAX_CAPACITY. Every iteration submits KEY_NUM random keys with random
 * accum/assign flags and asserts that
 *   - a key is found afterwards iff it was found before the update or was
 *     flagged `assign`,
 *   - keys that were found before and flagged `accum` hold the previously
 *     looked-up value plus `key * 0.00001`; every other found key holds
 *     `key * 0.00001`.
 *
 * @param max_hbm_for_vectors Forwarded through `nv::merlin::GB()` into
 *                            `TableOptions::max_hbm_for_vectors`.
 * @param key_start           Value for
 *                            `TableOptions::reserved_key_start_bit`.
 */
void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = 4 * 1024 - BUCKET_MAX_SIZE - 1;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 256;
  constexpr uint64_t THREAD_N = 8;

  std::vector<std::thread> threads;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);
  ASSERT_EQ(table->bucket_count(), 32);

  // Per-thread workload. `task_n` is only used to let thread 0 report
  // capacity changes.
  auto worker_function = [&table, KEY_NUM, options](int task_n) {
    constexpr float true_ratio = 0.5;
    K* h_keys;
    V* h_vectors;
    bool* h_found;
    bool* h_accum_or_assigns;
    K* h_keys_temp;
    V* h_vectors_temp;
    bool* h_found_temp;

    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_keys_temp, KEY_NUM * sizeof(K)));
    CUDA_CHECK(
        cudaMallocHost(&h_vectors_temp, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found_temp, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMallocHost(&h_accum_or_assigns, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    bool* d_found;
    bool* d_accum_or_assigns;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMalloc(&d_accum_or_assigns, KEY_NUM * sizeof(bool)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    while (table->capacity() * 2 < MAX_CAPACITY) {
      test_util::create_random_bools<K>(h_accum_or_assigns, KEY_NUM,
                                        true_ratio);

      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                  KEY_NUM);
      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_accum_or_assigns, h_accum_or_assigns,
                            KEY_NUM * sizeof(bool), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

      // Snapshot which keys exist (and their values) before the update.
      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_found_temp, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));

      table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_accum_or_assigns,
                             nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Look the same keys up again into a zeroed value buffer.
      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));
      size_t found_num = 0;
      // A key must be present if it existed before or was flagged `assign`.
      size_t expected_size = 0;
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found_temp[i] || !h_accum_or_assigns[i]) expected_size++;
      }

      // The host buffers are fully overwritten by these copies, so they do
      // not need to be cleared beforehand.
      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));

      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
          for (int j = 0; j < options.dim; j++) {
            if (h_found_temp[i] && h_accum_or_assigns[i]) {
              ASSERT_EQ(h_vectors[i * options.dim + j],
                        h_vectors_temp[i * options.dim + j] +
                            static_cast<float>(h_keys[i] * 0.00001));
            } else {
              ASSERT_EQ(h_vectors[i * options.dim + j],
                        static_cast<float>(h_keys[i] * 0.00001));
            }
          }
        }
      }
      ASSERT_EQ(found_num, expected_size);
      if (task_n == 0 && current_capacity != table->capacity()) {
        std::cout << "[test_dynamic_rehash_on_multi_threads] The capacity "
                     "changed from "
                  << current_capacity << " to " << table->capacity()
                  << std::endl;
        current_capacity = table->capacity();
      }
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }
    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));
    CUDA_CHECK(cudaFreeHost(h_keys_temp));
    CUDA_CHECK(cudaFreeHost(h_found_temp));
    CUDA_CHECK(cudaFreeHost(h_vectors_temp));
    CUDA_CHECK(cudaFreeHost(h_accum_or_assigns));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaFree(d_accum_or_assigns));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  // Construct the threads in place instead of moving a temporary.
  for (int i = 0; i < THREAD_N; ++i) threads.emplace_back(worker_function, i);

  for (auto& th : threads) {
    th.join();
  }
  ASSERT_GE(table->capacity() * 2, MAX_CAPACITY);
}

template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckAccumOrAssignValues(Table* table,
                              test_util::KVMSBuffer<K, V, S>& data_buffer,
                              size_t len, cudaStream_t stream) {
  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;
  std::unordered_set<K> map_current_batch;

  K* keys = data_buffer.keys_ptr();
  V* values = data_buffer.values_ptr();
  S* scores = data_buffer.scores_ptr();

  for (int i = 0; i < len; i++) {
    map_current_batch.insert(data_buffer.keys_ptr(false)[i]);
  }

  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;
  bool* h_tmp_accum_or_assigns = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;
  bool* d_tmp_accum_or_assigns = nullptr;

  size_t table_size_before = table->size(stream);
  size_t cap = table_size_before + len;

  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));
  h_tmp_accum_or_assigns = (bool*)malloc(cap * sizeof(bool));

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  CUDA_CHECK(
      cudaMallocAsync(&d_tmp_accum_or_assigns, cap * sizeof(bool), stream));
  CUDA_CHECK(
      cudaMemsetAsync(d_tmp_accum_or_assigns, 0, cap * sizeof(bool), stream));

  table->find(len, keys, d_tmp_values, d_tmp_accum_or_assigns, nullptr, stream);
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_accum_or_assigns, d_tmp_accum_or_assigns,
                             len * sizeof(bool), cudaMemcpyDeviceToHost,
                             stream));

  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaStreamSynchronize(stream));

  for (size_t i = 0; i < table_size_verify0; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    map_before_insert[h_tmp_keys[i]] = *vec;
  }

  auto start = std::chrono::steady_clock::now();
  table->accum_or_assign(len, keys, values, d_tmp_accum_or_assigns, nullptr,
                         stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  size_t new_cap = table_size_after;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_accum_or_assigns, d_tmp_accum_or_assigns,
                             table_size_after * sizeof(bool),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int64_t new_cap_K = (int64_t)new_cap;
  for (int64_t i = new_cap_K - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    map_after_insert[h_tmp_keys[i]] = *vec;
  }

  size_t value_diff_cnt = 0;
  for (auto& it : map_after_insert) {
    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);
    bool existed =
        (map_before_insert.end() != map_before_insert.find(it.first));
    bool required =
        (map_current_batch.end() != map_current_batch.find(it.first));
    V expected_value = 0;

    if (existed) {
      if (required) {
        expected_value = (map_before_insert.at(it.first)[0] +
                          static_cast<V>(it.first * 0.00001));
      } else {
        expected_value = map_before_insert.at(it.first)[0];
      }
    } else {
      if (required) {
        expected_value = static_cast<V>(it.first * 0.00001);
      }
    }
    for (size_t j = 0; j < dim; j++) {
      if (vec[j] != expected_value) {
        ++value_diff_cnt;
        break;
      }
    }
  }
  ASSERT_EQ(value_diff_cnt, 0);
  std::cout << "Check accum_or_assign behavior got "
            << "value_diff_cnt: " << value_diff_cnt
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_accum_or_assigns, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  free(h_tmp_accum_or_assigns);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_accum_or_assign_values_check(size_t max_hbm_for_vectors) {
  const size_t U = 524288;
  const size_t init_capacity = 1024;
  const size_t B = 524288 + 13;
  constexpr size_t dim = 64;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<K, V, S> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  S score = 0;
  for (int i = 0; i < 20; i++) {
    test_util::create_random_keys<K, S, V, dim>(
        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, B * 16);
    data_buffer.SyncData(true, stream);

    CheckAccumOrAssignValues<K, V, S, Table, dim>(table.get(), data_buffer, B,
                                                  stream);

    offset += B;
    score += 1;
  }
}

// GoogleTest registrations for the accum_or_assign test suite.
//
// Most cases invoke their test function twice: once with 16 and once with 0
// as the first argument. For test_accum_or_assign_values_check that argument
// is `max_hbm_for_vectors` (converted with nv::merlin::GB); the TODOs below
// refer to the 0 setting as "hybrid mode".
// NOTE(review): the optional second argument is presumably the reserved-key
// start bit, as in the sibling test files - confirm against the signatures.
TEST(AccumOrAssignTest, test_export_batch_if) {
  test_export_batch_if(16, 22);
  test_export_batch_if(0, 0);
}
TEST(AccumOrAssignTest, test_basic_when_full) {
  test_basic_when_full(16, 2);
  test_basic_when_full(0, 0);
}
TEST(AccumOrAssignTest, test_erase_if_pred) {
  test_erase_if_pred(16, 0);
  test_erase_if_pred(0, 5);
}
TEST(AccumOrAssignTest, test_rehash) {
  test_rehash(16, 7);
  test_rehash(0, 0);
}
TEST(AccumOrAssignTest, test_rehash_on_big_batch) {
  test_rehash_on_big_batch(16, 9);
  test_rehash_on_big_batch(0, 0);
}
TEST(AccumOrAssignTest, test_dynamic_rehash_on_multi_threads) {
  test_dynamic_rehash_on_multi_threads(16, 56);
  test_dynamic_rehash_on_multi_threads(0);
}
TEST(AccumOrAssignTest, test_evict_strategy_lru_basic) {
  test_evict_strategy_lru_basic(16);
  test_evict_strategy_lru_basic(0);
}
TEST(AccumOrAssignTest, test_evict_strategy_lfu_basic) {
  test_evict_strategy_lfu_basic(16, 3);
  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.
  // test_evict_strategy_lfu_basic(0);
}

TEST(AccumOrAssignTest, test_evict_strategy_epochlru_basic) {
  test_evict_strategy_epochlru_basic(16, 33);
  test_evict_strategy_epochlru_basic(0);
}

TEST(AccumOrAssignTest, test_evict_strategy_epochlfu_basic) {
  test_evict_strategy_epochlfu_basic(16);
  test_evict_strategy_epochlfu_basic(0, 44);
}
TEST(AccumOrAssignTest, test_evict_strategy_customized_basic) {
  test_evict_strategy_customized_basic(16);
  test_evict_strategy_customized_basic(0, 23);
}
TEST(AccumOrAssignTest, test_evict_strategy_customized_advanced) {
  test_evict_strategy_customized_advanced(16, 16);
  test_evict_strategy_customized_advanced(0);
}
TEST(AccumOrAssignTest, test_evict_strategy_customized_correct_rate) {
  // TODO(rhdong): remove the skip logic once the blossom CI issue is resolved.
  // The second run is skipped whenever the IS_BLOSSOM_CI environment variable
  // is set (to any value).
  const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI"));
  test_evict_strategy_customized_correct_rate(16, 61);
  if (!skip_hmem_check) {
    test_evict_strategy_customized_correct_rate(0);
  } else {
    std::cout << "The HMEM check is skipped in blossom CI!" << std::endl;
  }
}

TEST(AccumOrAssignTest, test_accum_or_assign_values_check) {
  test_accum_or_assign_values_check(16);
  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.
  // test_accum_or_assign_values_check(0);
}

================================================
FILE: tests/assign_score_test.cc.cu
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * test APIs: find_or_insert and assign,
 * move insert operation from `insert_or_assign` to `find`.
 */

#include <gtest/gtest.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <thread>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

// Number of value elements per key; the tests below use it both as
// `options.dim` and as the DIM template argument of the key generators.
constexpr size_t DIM = 16;
// Key, value-element and score types shared by the tests below.
using K = uint64_t;
using V = float;
using S = uint64_t;
// Short aliases for the library types used to configure each table.
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

/**
 * Checks score handling of `find_or_insert` + `assign` on a kLru table.
 *
 * Every round builds a fresh table and runs two phases:
 *   1. Insert BASE_KEY_NUM keys with `find_or_insert` (no scores supplied) and
 *      require that the table holds BUCKET_MAX_SIZE entries, that every
 *      exported score lies in [start_ts, end_ts] (timestamps taken with
 *      test_util::host_nano around the call), and that every value element
 *      equals key * 0.00001.
 *   2. `find_or_insert` then `assign` TEST_KEY_NUM keys, two of which are
 *      copies of base keys 72 and 73. The table must still hold
 *      BUCKET_MAX_SIZE entries; exported keys that belong to the test set must
 *      carry a score in [start_ts, end_ts] measured around `assign`, all
 *      other keys a score <= start_ts.
 *
 * NOTE(review): create_keys_in_one_buckets presumably generates keys that all
 * land in one bucket with values key * 0.00001 - confirm in test_util.
 *
 * @param max_hbm_for_vectors  Passed through nv::merlin::GB() into
 *                             TableOptions::max_hbm_for_vectors.
 * @param key_start            Stored in TableOptions::reserved_key_start_bit.
 */
void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make the last two test keys duplicates of existing base keys so that the
  // test batch mixes keys already in the table with new ones.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: fill the table with the base keys.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts = test_util::host_nano<S>(stream);
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      // All BASE_KEY_NUM scores must fall inside the measured window, so the
      // upper bound is checked on the largest score (last sorted element),
      // not on element TEST_KEY_NUM - 1.
      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);
      for (size_t i = 0; i < dump_counter; i++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: insert the test keys, then refresh their scores via `assign`.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Only the `assign` call is timed: the scores checked below are the
      // ones it is expected to have written.
      S start_ts = test_util::host_nano<S>(stream);
      table->assign(TEST_KEY_NUM, d_keys_temp, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Collect the scores of exported keys that belong to the test batch.
      std::vector<S> h_scores_temp_sorted(TEST_KEY_NUM);
      int ctr = 0;
      for (uint64_t i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {
          ASSERT_GT(h_scores_temp[i], BUCKET_MAX_SIZE);
          h_scores_temp_sorted[ctr++] = h_scores_temp[i];
        } else {
          ASSERT_LE(h_scores_temp[i], start_ts);
        }
      }
      // At least one test key must be present, otherwise `ctr - 1` below
      // would index out of bounds.
      ASSERT_GT(ctr, 0);
      std::sort(h_scores_temp_sorted.begin(),
                h_scores_temp_sorted.begin() + ctr);

      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);
      for (size_t i = 0; i < dump_counter; i++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Checks score handling of `find_or_insert` + `assign` on a kLfu table.
 *
 * Every round builds a fresh table and runs two phases:
 *   1. Insert BASE_KEY_NUM keys with explicit scores and require that each
 *      exported score equals key % freq_range and each value element equals
 *      key * 0.00001.
 *   2. Call `assign` and then `find_or_insert` with TEST_KEY_NUM keys, two of
 *      which are copies of base keys 72 and 73. Keys that are in both sets
 *      must end up with 3 * (key % freq_range); every other exported key must
 *      still carry key % freq_range.
 *
 * NOTE(review): create_keys_in_one_buckets_lfu presumably generates keys that
 * share one bucket, with scores key % freq_range - confirm in test_util.
 *
 * @param max_hbm_for_vectors  Passed through nv::merlin::GB() into
 *                             TableOptions::max_hbm_for_vectors.
 * @param key_start            Stored in TableOptions::reserved_key_start_bit.
 */
void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  // Scratch buffers are sized for the larger of the two key sets.
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  // Modulus used for the expected per-key score (key % freq_range).
  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Make the last two test keys duplicates of existing base keys, including
  // their scores and values.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  // NOTE(review): global_epoch is incremented every round but never handed to
  // the table in this function.
  S global_epoch = 1;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A fresh table per round keeps the rounds independent.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys with explicit scores.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      // The export reuses the device scratch buffers as its output.
      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: `assign` followed by `find_or_insert` with the test keys.
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));

      table->assign(TEST_KEY_NUM, d_keys_temp, d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);
        bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);
        // Keys in both sets were scored three times: the phase-1 insert, the
        // `assign`, and the phase-2 `find_or_insert`.
        if (in_base && in_test) {
          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) *
                                          3);  // will update score when found.
        } else {
          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));
        }
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Checks score handling of `find_or_insert` + `assign` on a kEpochLru table.
 *
 * The assertions expect each score to hold the global epoch in its upper 32
 * bits and a timestamp in its lower 32 bits, where the timestamp is
 * test_util::host_nano shifted right by RSHIFT_ON_NANO and masked to 32 bits.
 *
 * Every round builds a fresh table and runs two phases:
 *   1. Set the global epoch, insert BASE_KEY_NUM keys without scores, and
 *      require all exported scores to lie between
 *      (epoch << 32 | start_ts) and (epoch << 32 | end_ts).
 *   2. Bump the epoch, then `assign` and `find_or_insert` TEST_KEY_NUM keys
 *      (two of them copies of base keys 72 and 73). Exported keys from the
 *      test set must have scores inside the new epoch's window; all other
 *      keys must have scores <= (new epoch << 32 | start_ts).
 *
 * NOTE(review): create_keys_in_one_buckets presumably generates keys that all
 * land in one bucket with values key * 0.00001 - confirm in test_util.
 *
 * @param max_hbm_for_vectors  Passed through nv::merlin::GB() into
 *                             TableOptions::max_hbm_for_vectors.
 * @param key_start            Stored in TableOptions::reserved_key_start_bit.
 */
void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  // Right shift applied to the nanosecond timestamp before it is compared
  // with the lower 32 bits of a score.
  constexpr int RSHIFT_ON_NANO = 20;

  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  // Scratch buffers are sized for the larger of the two key sets.
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make the last two test keys duplicates of existing base keys so that the
  // test batch mixes keys already in the table with new ones.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  // Keeps growing across rounds: phase 2 increments it, and the next round's
  // phase 1 passes the incremented value to its fresh table.
  S global_epoch = 1;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: fill the table with the base keys under the current epoch.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
      table->set_global_epoch(global_epoch);
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      // The export reuses the device scratch buffers as its output.
      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      // Smallest and largest score must both lie inside the measured window.
      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],
                (global_epoch << 32 | end_ts));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: advance the epoch, then `assign` and `find_or_insert` the
    // test keys back to back on the same stream.
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
      table->set_global_epoch(global_epoch);
      table->assign(TEST_KEY_NUM, d_keys_temp, nullptr, stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Collect the scores of exported keys that belong to the test batch;
      // `ctr` counts how many of them were found in the export.
      std::vector<S> h_scores_temp_sorted(TEST_KEY_NUM);
      int ctr = 0;
      for (int i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {
          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));
          h_scores_temp_sorted[ctr++] = h_scores_temp[i];
        } else {
          // Keys untouched in this phase still carry an older score.
          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));
        }
      }
      std::sort(h_scores_temp_sorted.begin(),
                h_scores_temp_sorted.begin() + ctr);

      // NOTE(review): indexing with ctr - 1 assumes at least one test key was
      // exported (ctr > 0).
      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));
      ASSERT_LE(h_scores_temp_sorted[ctr - 1], (global_epoch << 32 | end_ts));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks the scores exported by a table that uses EvictStrategy::kEpochLfu.
//
// Every one of the TEST_TIMES rounds creates a fresh table and runs two
// phases:
//   1. Set the global epoch and find_or_insert the BASE_KEY_NUM base keys,
//      then export the table and compare every score with
//      test_util::make_expected_score_for_epochlfu(global_epoch, frequency).
//   2. Increment the global epoch, then assign + find_or_insert the
//      TEST_KEY_NUM test keys (index 0 is a generated key, indices 1..3 are
//      base keys 71..73) and compare the exported scores again.
// Exported vectors are expected to equal key * 0.00001 in both phases.
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//                      TableOptions::max_hbm_for_vectors.
// key_start:           stored in TableOptions::reserved_key_start_bit.
//
// NOTE: a fatal gtest assertion returns from this function immediately, so
// the cleanup at the bottom is skipped when a check fails.
void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;

  // Host copies of the base set, the test set and the exported table content.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device staging buffers, reused both as table input and as export output.
  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Simulate overflow of low 32bits.
  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -
                                     static_cast<uint32_t>(1));

  // Test entries 1..3 reuse base keys 71..73; entry 0 stays a generated key.
  h_keys_test[1] = h_keys_base[71];
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[1] = h_scores_base[71];
  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[1 * options.dim + d] = h_vectors_base[71 * options.dim + d];
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  // The epoch keeps growing across rounds; it is bumped once per round.
  S global_epoch = 1;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys under the current epoch.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        // Key 71 carries the overridden near-overflow score; every other key
        // is checked against key % freq_range.
        const S inserted_freq = (h_keys_temp[i] == h_keys_base[71])
                                    ? h_scores_base[71]
                                    : (h_keys_temp[i] % freq_range);
        S expected_score = test_util::make_expected_score_for_epochlfu<S>(
            global_epoch, inserted_freq);
        ASSERT_EQ(h_scores_temp[i], expected_score);

        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: move to the next epoch and touch the test keys twice
    // (assign followed by find_or_insert).
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_scores_temp, stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // The key with the near-overflow score must still be in the table.
      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),
                                                 h_keys_temp.end(),
                                                 h_keys_base[71]));

      for (size_t i = 0; i < dump_counter; i++) {
        bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);
        bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);

        if (in_base && in_test) {
          if (h_keys_temp[i] == h_keys_base[71]) {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch, h_scores_base[71] * 2);
            ASSERT_EQ(h_scores_temp[i], expected_score);
          } else {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch, (h_keys_temp[i] % freq_range) *
                                  3);  // will update score when found.
            ASSERT_EQ(h_scores_temp[i], expected_score);
          }
        } else {
          // Key 71 is part of both sets, so it never reaches this branch.
          // A key that is only in the base set is expected to keep the
          // previous epoch (in_base == 1); any other key is expected to
          // carry the current epoch.
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch - static_cast<S>(in_base),
              (h_keys_temp[i] % freq_range));

          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks that a table using EvictStrategy::kCustomized ends up with the
// caller-supplied scores after inserts and updates.
//
// Every one of the TEST_TIMES rounds creates a fresh table and runs two
// phases:
//   1. find_or_insert the BASE_KEY_NUM base keys with scores
//      base_score_start + index; the sorted exported scores must match
//      test_util::range<S, TEMP_KEY_NUM>(base_score_start).
//   2. assign + find_or_insert the TEST_KEY_NUM test keys with scores
//      test_score_start + index (indices 64.. reuse the base keys of the same
//      index); the sorted exported scores must match
//      test_util::range<S, TEST_KEY_NUM>(test_score_start).
// Exported vectors are expected to equal key * 0.00001 in both phases.
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//                      TableOptions::max_hbm_for_vectors.
// key_start:           stored in TableOptions::reserved_key_start_bit.
//
// NOTE: a fatal gtest assertion returns from this function immediately, so
// the cleanup at the bottom is skipped when a check fails.
void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 128;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Host copies of the base set, the test set and the exported table content.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device staging buffers, reused both as table input and as export output.
  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  // Overwrite the generated scores with a known consecutive sequence.
  const S base_score_start = 1000;
  for (size_t i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);
  // Test scores continue right after the last base score.
  const S test_score_start = base_score_start + BASE_KEY_NUM;
  for (size_t i = 0; i < TEST_KEY_NUM; i++) {
    h_scores_test[i] = test_score_start + i;
  }
  // The upper part of the test set reuses base keys (and their vectors).
  for (size_t i = 64; i < TEST_KEY_NUM; i++) {
    h_keys_test[i] = h_keys_base[i];
    for (size_t j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys and verify the exported content.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Export order is not relied upon, so compare the sorted scores.
      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (size_t i = 0; i < dump_counter; i++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: assign + find_or_insert the test keys and verify again.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_scores_temp, stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      // Every exported score is expected to come from the test set now.
      auto expected_range_test =
          test_util::range<S, TEST_KEY_NUM>(test_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range_test.begin()));
      for (size_t i = 0; i < dump_counter; i++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks how a table using EvictStrategy::kCustomized treats low-scored new
// keys versus rescored existing keys.
//
// Every one of the TEST_TIMES rounds creates a fresh table and runs two
// phases:
//   1. find_or_insert the BASE_KEY_NUM base keys with scores
//      base_score_start + index and verify the exported scores and vectors.
//   2. assign + find_or_insert TEST_KEY_NUM test keys: indices 0..3 are new
//      keys with scores below every base score, indices 4..7 are base keys
//      72..75 with replacement scores. The export must not contain the four
//      new keys, must still contain the four existing keys, and those four
//      must carry exactly the replacement scores.
// Exported vectors are expected to equal key * 0.00001 in both phases.
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//                      TableOptions::max_hbm_for_vectors.
// key_start:           stored in TableOptions::reserved_key_start_bit.
//
// NOTE: a fatal gtest assertion returns from this function immediately, so
// the cleanup at the bottom is skipped when a check fails.
void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,
                                             int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 8;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 256;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Host copies of the base set, the test set and the exported table content.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device staging buffers, reused both as table input and as export output.
  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  // Overwrite the generated scores with a known consecutive sequence.
  const S base_score_start = 1000;
  for (size_t i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Test entries 4..7 reuse base keys 72..75.
  h_keys_test[4] = h_keys_base[72];
  h_keys_test[5] = h_keys_base[73];
  h_keys_test[6] = h_keys_base[74];
  h_keys_test[7] = h_keys_base[75];

  // Give the four new keys scores lower than every base score; they are
  // expected not to be inserted.
  h_scores_test[0] = 20;
  h_scores_test[1] = 78;
  h_scores_test[2] = 97;
  h_scores_test[3] = 98;

  // Give the four existing keys new scores; only their scores are expected
  // to be refreshed.
  h_scores_test[4] = 99;
  h_scores_test[5] = 1010;
  h_scores_test[6] = 1020;
  h_scores_test[7] = 1035;

  for (size_t i = 4; i < TEST_KEY_NUM; i++) {
    for (size_t j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] =
          static_cast<V>(h_keys_test[i] * 0.00001);
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys and verify the exported content.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Export order is not relied upon, so compare the sorted scores.
      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (size_t i = 0; i < dump_counter; i++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: assign + find_or_insert the test keys and verify again.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_scores_temp, stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // New low-scored keys (0..3) must be absent, rescored existing keys
      // (4..7) must still be present.
      for (size_t i = 0; i < TEST_KEY_NUM; i++) {
        if (i < 4) {
          ASSERT_EQ(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        } else {
          ASSERT_NE(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        }
      }
      for (size_t i = 0; i < TEMP_KEY_NUM; i++) {
        // An exported key that matches one of the rescored test keys must
        // carry that test key's score.
        for (size_t k = 4; k < TEST_KEY_NUM; k++) {
          if (h_keys_temp[i] == h_keys_test[k]) {
            ASSERT_EQ(h_scores_temp[i], h_scores_test[k]);
          }
        }

        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckAssignOnEpochLfu(Table* table,
                           test_util::KVMSBuffer<K, V, S>* data_buffer,
                           test_util::KVMSBuffer<K, V, S>* evict_buffer,
                           test_util::KVMSBuffer<K, V, S>* pre_data_buffer,
                           size_t len, cudaStream_t stream, TableOptions& opt,
                           unsigned int global_epoch) {
  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;

  std::unordered_map<K, S> scores_map_before_insert;
  std::map<K, S> scores_map_after_insert;

  std::map<K, S> scores_map_current_batch;
  std::map<K, S> scores_map_current_evict;

  K* keys = data_buffer->keys_ptr();
  V* values = data_buffer->values_ptr();
  S* scores = data_buffer->scores_ptr();

  K* evicted_keys = evict_buffer->keys_ptr();
  V* evicted_values = evict_buffer->values_ptr();
  S* evicted_scores = evict_buffer->scores_ptr();

  for (size_t i = 0; i < len; i++) {
    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =
        data_buffer->scores_ptr(false)[i];
  }

  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;
  bool* h_tmp_founds = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;
  bool* d_tmp_founds = nullptr;

  size_t table_size_before = table->size(stream);
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));
  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));
  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,
                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,
                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,
                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  for (size_t i = 0; i < cap; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_before_insert[h_tmp_keys[i]] = *vec;
  }

  for (size_t i = 0; i < table_size_before; i++) {
    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];
  }

  table->set_global_epoch(global_epoch);
  table->assign(len, keys, scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  {
    size_t table_size_verify1 = table->export_batch(
        table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

    CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                               table_size_before * sizeof(K),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                               table_size_before * dim * sizeof(V),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                               table_size_before * sizeof(S),
                               cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table_size_verify1, table_size_before);

    size_t score_error_cnt = 0;

    for (int64_t i = table_size_before - 1; i >= 0; i--) {
      test_util::ValueArray<V, dim>* vec =
          reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                           i * dim);
      values_map_after_insert[h_tmp_keys[i]] = *vec;
      scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
    }

    for (auto it : scores_map_current_batch) {
      const K key = it.first;
      const K score = it.second;
      S current_score = scores_map_after_insert[key];
      S score_before_insert = 0;
      if (scores_map_before_insert.find(key) !=
          scores_map_before_insert.end()) {
        score_before_insert = scores_map_before_insert[key];
        bool valid =
            ((current_score >> 32) == global_epoch) &&
            ((current_score & 0xFFFFFFFF) ==
             ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));

        if (!valid) {
          score_error_cnt++;
        }
      }
    }
    std::cout << "Check assign behavior got "
              << ", score_error_cnt: " << score_error_cnt
              << ", while len: " << len << std::endl;
    ASSERT_EQ(score_error_cnt, 0);
  }

  for (int64_t i = 0; i < table_size_before; i++) {
    values_map_before_insert[h_tmp_keys[i]] =
        values_map_after_insert[h_tmp_keys[i]];
    scores_map_before_insert[h_tmp_keys[i]] =
        scores_map_after_insert[h_tmp_keys[i]];
  }
  values_map_after_insert.clear();
  scores_map_after_insert.clear();

  auto start = std::chrono::steady_clock::now();
  size_t filtered_len = table->insert_and_evict(
      len, keys, values,
      (Table::evict_strategy == EvictStrategy::kLru ||
       Table::evict_strategy == EvictStrategy::kEpochLru)
          ? nullptr
          : scores,
      evicted_keys, evicted_values, evicted_scores, stream);
  evict_buffer->SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  for (size_t i = 0; i < filtered_len; i++) {
    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =
        evict_buffer->scores_ptr(false)[i];
  }

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  size_t new_cap = table_size_after + filtered_len;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,
                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,
                             evicted_values, filtered_len * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,
                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  size_t key_miss_cnt = 0;
  size_t value_diff_cnt = 0;
  size_t score_error_cnt1 = 0;
  size_t score_error_cnt2 = 0;

  for (int64_t i = new_cap - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_after_insert[h_tmp_keys[i]] = *vec;
    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
    if (i >= (new_cap - filtered_len)) {
      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));
      if (!valid) {
        score_error_cnt1++;
      }
    }
  }

  for (auto it : scores_map_current_batch) {
    const K key = it.first;
    const K score = it.second;
    S current_score = scores_map_after_insert[key];
    S score_before_insert = 0;
    if (values_map_after_insert.find(key) != values_map_after_insert.end() &&
        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {
      score_before_insert = scores_map_before_insert[key];
    }
    bool valid = ((current_score >> 32) == global_epoch) &&
                 ((current_score & 0xFFFFFFFF) ==
                  ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));

    if (!valid) {
      score_error_cnt2++;
    }
  }

  for (auto& it : values_map_before_insert) {
    if (values_map_after_insert.find(it.first) ==
        values_map_after_insert.end()) {
      ++key_miss_cnt;
      continue;
    }
    test_util::ValueArray<V, dim>& vec0 = it.second;
    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);
    for (size_t j = 0; j < dim; j++) {
      if (vec0[j] != vec1[j]) {
        ++value_diff_cnt;
        break;
      }
    }
  }

  std::cout << "Check insert_and_evict behavior got "
            << "key_miss_cnt: " << key_miss_cnt
            << ", value_diff_cnt: " << value_diff_cnt
            << ", score_error_cnt1: " << score_error_cnt1
            << ", score_error_cnt2: " << score_error_cnt2
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  ASSERT_EQ(key_miss_cnt, 0);
  ASSERT_EQ(value_diff_cnt, 0);
  ASSERT_EQ(score_error_cnt1, 0);
  ASSERT_EQ(score_error_cnt2, 0);

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  free(h_tmp_founds);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) {
  const size_t U = 1024 * 1024;
  const size_t B = 100000;
  constexpr size_t dim = 16;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = U;
  opt.max_hbm_for_vectors = U * dim * sizeof(V);
  opt.max_bucket_size = 128;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<K, V, S> evict_buffer;
  evict_buffer.Reserve(B, dim, stream);
  evict_buffer.ToZeros(stream);

  test_util::KVMSBuffer<K, V, S> data_buffer;
  test_util::KVMSBuffer<K, V, S> pre_data_buffer;
  data_buffer.Reserve(B, dim, stream);
  pre_data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  int freq_range = 100;
  float repeat_rate = 0.9;
  for (unsigned int global_epoch = 1; global_epoch <= 20; global_epoch++) {
    repeat_rate = global_epoch <= 1 ? 0.0 : 0.1;
    if (global_epoch <= 1) {
      test_util::create_random_keys_advanced<K, S, V>(
          dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
          data_buffer.values_ptr(false), (int)B, B * 32, freq_range);
    } else {
      test_util::create_random_keys_advanced<K, S, V>(
          dim, data_buffer.keys_ptr(false), pre_data_buffer.keys_ptr(false),
          data_buffer.scores_ptr(false), data_buffer.values_ptr(false), (int)B,
          B * 32, freq_range, repeat_rate);
    }
    data_buffer.SyncData(true, stream);
    if (global_epoch <= 1) {
      pre_data_buffer.CopyFrom(data_buffer, stream);
    }

    CheckAssignOnEpochLfu<K, V, S, Table, dim>(table.get(), &data_buffer,
                                               &evict_buffer, &pre_data_buffer,
                                               B, stream, opt, global_epoch);

    pre_data_buffer.CopyFrom(data_buffer, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    offset += B;
  }
}

void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors,
                                                 int key_start = 0) {
  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
  constexpr uint64_t STEPS = 128;
  constexpr uint64_t MAX_BUCKET_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t TEST_TIMES = 1;
  float expected_correct_rate = 0.964;
  const int rounds = 12;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = MAX_BUCKET_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();
  S* h_scores_base = test_util::HostBuffer<S>(BATCH_SIZE).ptr();
  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();

  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();
  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();
  V* h_vectors_temp =
      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t global_start_key = 100000;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    size_t start_key = global_start_key;

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    for (int r = 0; r < rounds; r++) {
      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;
      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;
      size_t expected_table_size =
          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)
                   : INIT_CAPACITY;

      for (int s = 0; s < STEPS; s++) {
        test_util::create_continuous_keys<K, S, V, DIM>(
            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);
        start_key += BATCH_SIZE;

        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),
                              cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,
                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,
                              BATCH_SIZE * sizeof(V) * options.dim,
                              cudaMemcpyHostToDevice));
        table->assign(BATCH_SIZE, d_keys_temp, d_scores_temp, stream);
        table->find_or_insert(BATCH_SIZE, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_GE(total_size, expected_table_size);
      ASSERT_EQ(MAX_CAPACITY, table->capacity());

      size_t dump_counter = table->export_batch(
          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,
                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,
                            MAX_CAPACITY * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));

      size_t bigger_score_counter = 0;
      K max_key = 0;
      size_t values_error_counter = 0;
      for (int i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);
        max_key = std::max(max_key, h_keys_temp[i]);
        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;
        for (int j = 0; j < options.dim; j++) {
          if (h_vectors_temp[i * options.dim + j] !=
              static_cast<float>(h_keys_temp[i] * 0.00001)) {
            values_error_counter++;
          }
        }
      }

      ASSERT_EQ(values_error_counter, 0);
      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;
      std::cout << std::setprecision(3) << "[Round " << r << "]"
                << "correct_rate=" << correct_rate << std::endl;
      ASSERT_GE(max_key, expected_max_key);
      ASSERT_GE(correct_rate, expected_correct_rate);
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckFindOrInsertValues(Table* table, K* keys, V* values, S* scores,
                             size_t len, cudaStream_t stream) {
  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;
  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;

  size_t table_size_before = table->size(stream);
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaStreamSynchronize(stream));

  for (size_t i = 0; i < table_size_verify0; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    map_before_insert[h_tmp_keys[i]] = *vec;
  }

  auto start = std::chrono::steady_clock::now();
  table->find_or_insert(len, keys, values, nullptr, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  size_t new_cap = table_size_after;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int64_t new_cap_K = (int64_t)new_cap;
  for (int64_t i = new_cap_K - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    map_after_insert[h_tmp_keys[i]] = *vec;
  }

  size_t value_diff_cnt = 0;
  for (auto& it : map_after_insert) {
    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);
    for (size_t j = 0; j < dim; j++) {
      if (vec[j] != static_cast<float>(it.first * 0.00001)) {
        ++value_diff_cnt;
        break;
      }
    }
  }
  ASSERT_EQ(value_diff_cnt, 0);
  std::cout << "Check find_or_insert behavior got "
            << "value_diff_cnt: " << value_diff_cnt
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_find_or_insert_values_check(size_t max_hbm_for_vectors) {
  const size_t U = 524288;
  const size_t init_capacity = 1024;
  const size_t B = 524288 + 13;
  constexpr size_t dim = 64;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
  opt.dim = 64;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<K, V, S> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  S score = 0;
  for (int i = 0; i < 20; i++) {
    test_util::create_random_keys<K, S, V, dim>(
        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, B * 16);
    data_buffer.SyncData(true, stream);

    CheckFindOrInsertValues<K, V, S, Table, dim>(
        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),
        data_buffer.scores_ptr(), B, stream);

    offset += B;
    score += 1;
  }
}

// GoogleTest registrations for the AssignScoreTest suite.
//
// Most cases run their driver twice: once with a first argument of 16 and
// once with 0. The drivers visible in this file forward that argument through
// nv::merlin::GB() into TableOptions::max_hbm_for_vectors and store the
// optional second argument in TableOptions::reserved_key_start_bit.
// NOTE(review): the *_basic / *_advanced drivers are defined outside this
// excerpt; presumably they follow the same convention -- confirm.

// LRU strategy: 16 with the default key start, then 0 with key start 34.
TEST(AssignScoreTest, test_evict_strategy_lru_basic) {
  test_evict_strategy_lru_basic(16);
  test_evict_strategy_lru_basic(0, 34);
}
// LFU strategy: 16 with the default key start, then 0 with key start 2.
TEST(AssignScoreTest, test_evict_strategy_lfu_basic) {
  test_evict_strategy_lfu_basic(16);
  test_evict_strategy_lfu_basic(0, 2);
}
// Epoch-LRU strategy: 16 with key start 51, then 0 with the default.
TEST(AssignScoreTest, test_evict_strategy_epochlru_basic) {
  test_evict_strategy_epochlru_basic(16, 51);
  test_evict_strategy_epochlru_basic(0);
}
// Epoch-LFU strategy: 16 with key start 4, then 0 with the default.
TEST(AssignScoreTest, test_evict_strategy_epochlfu_basic) {
  test_evict_strategy_epochlfu_basic(16, 4);
  test_evict_strategy_epochlfu_basic(0);
}
// Customized strategy, basic case: 16 with the default, then 0 with 11.
TEST(AssignScoreTest, test_evict_strategy_customized_basic) {
  test_evict_strategy_customized_basic(16);
  test_evict_strategy_customized_basic(0, 11);
}
// Customized strategy, advanced case: 16 with key start 33, then 0.
TEST(AssignScoreTest, test_evict_strategy_customized_advanced) {
  test_evict_strategy_customized_advanced(16, 33);
  test_evict_strategy_customized_advanced(0);
}
// Multi-epoch assign check on an Epoch-LFU table; only the 16 variant runs.
TEST(AssignScoreTest, test_assign_advanced_on_epochlfu) {
  test_assign_advanced_on_epochlfu(16);
}
// Correct-rate check for the customized strategy. The variant that passes 0
// as max_hbm_for_vectors is skipped whenever the IS_BLOSSOM_CI environment
// variable is set.
TEST(AssignScoreTest, test_evict_strategy_customized_correct_rate) {
  // TODO(rhdong): remove the skip logic once the blossom CI issue is resolved.
  const char* blossom_ci_flag = std::getenv("IS_BLOSSOM_CI");

  test_evict_strategy_customized_correct_rate(16, 44);

  if (blossom_ci_flag != nullptr) {
    std::cout << "The HMEM check is skipped in blossom CI!" << std::endl;
    return;
  }
  test_evict_strategy_customized_correct_rate(0);
}

================================================
FILE: tests/assign_values_test.cc.cu
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * test API: assign_values
 */

#include <gtest/gtest.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <thread>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

constexpr size_t DIM = 16;
using K = uint64_t;
using V = float;
using S = uint64_t;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

/**
 * Basic assign_values() check on a table using the kLru eviction strategy.
 *
 * Each of the TEST_TIMES iterations works on a fresh table:
 *   1. BASE_KEY_NUM keys are inserted with find_or_insert() and a null scores
 *      pointer. The table must then hold exactly BUCKET_MAX_SIZE entries,
 *      every exported score must lie within [start_ts, end_ts] (timestamps
 *      taken with test_util::host_nano around the insert), and every value
 *      element must equal static_cast<float>(key * 0.00001).
 *   2. assign_values() is called with TEST_KEY_NUM keys, of which entries 2
 *      and 3 are copies of base keys 72 and 73 carrying key * 0.00002 values.
 *      The table size must stay unchanged, only those two keys may show the
 *      new values, and no score may be newer than the timestamp taken right
 *      before the call (i.e. assign_values() must not touch scores).
 *
 * @param max_hbm_for_vectors Passed through nv::merlin::GB() into
 *                            TableOptions::max_hbm_for_vectors.
 * @param key_start           Stored in TableOptions::reserved_key_start_bit.
 */
void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  // Base and test keys come from two disjoint key ranges.
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Test entries 2 and 3 reuse keys that are already in the table, but with
  // a different value pattern so an overwrite is observable.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[2 * options.dim + d] =
        static_cast<float>(h_keys_base[72] * 0.00002);
    h_vectors_test[3 * options.dim + d] =
        static_cast<float>(h_keys_base[73] * 0.00002);
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t t = 0; t < TEST_TIMES; t++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      // Phase 1: populate the table with the base keys.
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts = test_util::host_nano<S>(stream);
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Sort exactly the BASE_KEY_NUM scores that were just copied back so
      // the first/last elements are the true minimum/maximum.
      std::vector<S> h_scores_temp_sorted(
          h_scores_temp.begin(), h_scores_temp.begin() + BASE_KEY_NUM);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      // The upper bound has to hold for the largest score. The original
      // indexed with TEST_KEY_NUM - 1, which only bounded the 4th smallest.
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);
      for (size_t row = 0; row < dump_counter; row++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[row * options.dim + j],
                    static_cast<float>(h_keys_temp[row] * 0.00001));
        }
      }
    }

    {
      // Phase 2: assign new values; only keys already present may change.
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));

      S start_ts = test_util::host_nano<S>(stream);
      table->assign_values(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t row = 0; row < TEMP_KEY_NUM; row++) {
        V expected_v = (h_keys_temp[row] == h_keys_test[2] ||
                        h_keys_temp[row] == h_keys_test[3])
                           ? static_cast<V>(h_keys_temp[row] * 0.00002)
                           : static_cast<V>(h_keys_temp[row] * 0.00001);
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[row * options.dim + j], expected_v);
        }
        // Scores still predate the assign_values() call.
        ASSERT_LE(h_scores_temp[row], start_ts);
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Simulate overflow of low 32bits.
  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -
                                     static_cast<uint32_t>(1));

  h_keys_test[1] = h_keys_base[71];
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[1] = h_scores_base[71];
  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[1 * options.dim + i] =
        static_cast<float>(h_keys_base[71] * 0.00002);
    h_vectors_test[2 * options.dim + i] =
        static_cast<float>(h_keys_base[72] * 0.00002);
    h_vectors_test[3 * options.dim + i] =
        static_cast<float>(h_keys_base[73] * 0.00002);
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S global_epoch = 1;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        if (h_keys_temp[i] == h_keys_base[71]) {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, h_scores_base[71]);
          ASSERT_EQ(h_scores_temp[i], expected_score);
        } else {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, (h_keys_temp[i] % freq_range));
          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign_values(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),
                                                 h_keys_temp.end(),
                                                 h_keys_base[71]));

      for (int i = 0; i < dump_counter; i++) {
        if (h_keys_temp[i] == h_keys_base[71]) {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, h_scores_base[71]);
          ASSERT_EQ(h_scores_temp[i], expected_score);
        } else {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, (h_keys_temp[i] % freq_range));
          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (int j = 0; j < options.dim; j++) {
          V expected_v = (h_keys_temp[i] == h_keys_test[1] ||
                          h_keys_temp[i] == h_keys_test[2] ||
                          h_keys_temp[i] == h_keys_test[3])
                             ? static_cast<V>(h_keys_temp[i] * 0.00002)
                             : static_cast<V>(h_keys_temp[i] * 0.00001);
          ASSERT_EQ(h_vectors_temp[i * options.dim + j], expected_v);
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckAssignOnEpochLfu(Table* table,
                           test_util::KVMSBuffer<K, V, S>* data_buffer,
                           test_util::KVMSBuffer<K, V, S>* evict_buffer,
                           test_util::KVMSBuffer<K, V, S>* pre_data_buffer,
                           size_t len, cudaStream_t stream, TableOptions& opt,
                           unsigned int global_epoch) {
  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;

  std::unordered_map<K, S> scores_map_before_insert;
  std::map<K, S> scores_map_after_insert;

  std::map<K, S> scores_map_current_batch;
  std::map<K, S> scores_map_current_evict;

  K* keys = data_buffer->keys_ptr();
  V* values = data_buffer->values_ptr();
  S* scores = data_buffer->scores_ptr();

  K* evicted_keys = evict_buffer->keys_ptr();
  V* evicted_values = evict_buffer->values_ptr();
  S* evicted_scores = evict_buffer->scores_ptr();

  for (size_t i = 0; i < len; i++) {
    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =
        data_buffer->scores_ptr(false)[i];
  }

  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;
  bool* h_tmp_founds = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;
  bool* d_tmp_founds = nullptr;

  size_t table_size_before = table->size(stream);
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));
  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));
  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,
                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,
                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,
                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  for (size_t i = 0; i < cap; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_before_insert[h_tmp_keys[i]] = *vec;
  }

  for (size_t i = 0; i < table_size_before; i++) {
    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];
  }

  table->assign_values(len, keys, values, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  {
    size_t table_size_verify1 = table->export_batch(
        table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

    CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                               table_size_before * sizeof(K),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                               table_size_before * dim * sizeof(V),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                               table_size_before * sizeof(S),
                               cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table_size_verify1, table_size_before);

    size_t score_error_cnt = 0;

    for (int64_t i = table_size_before - 1; i >= 0; i--) {
      test_util::ValueArray<V, dim>* vec =
          reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                           i * dim);
      values_map_after_insert[h_tmp_keys[i]] = *vec;
      scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
    }

    for (auto it : scores_map_current_batch) {
      const K key = it.first;
      const K score = it.second;
      S current_score = scores_map_after_insert[key];
      S score_before_insert = 0;
      if (scores_map_before_insert.find(key) !=
          scores_map_before_insert.end()) {
        score_before_insert = scores_map_before_insert[key];
        bool valid = ((current_score >> 32) < global_epoch) &&
                     ((current_score & 0xFFFFFFFF) ==
                      (0xFFFFFFFF & score_before_insert));

        if (!valid) {
          score_error_cnt++;
        }
      }
    }
    std::cout << "Check assign behavior got "
              << ", score_error_cnt: " << score_error_cnt
              << ", while len: " << len << std::endl;
    ASSERT_EQ(score_error_cnt, 0);
  }

  for (int64_t i = 0; i < table_size_before; i++) {
    values_map_before_insert[h_tmp_keys[i]] =
        values_map_after_insert[h_tmp_keys[i]];
    scores_map_before_insert[h_tmp_keys[i]] =
        scores_map_after_insert[h_tmp_keys[i]];
  }
  values_map_after_insert.clear();
  scores_map_after_insert.clear();

  table->set_global_epoch(global_epoch);
  auto start = std::chrono::steady_clock::now();
  size_t filtered_len = table->insert_and_evict(
      len, keys, values,
      (Table::evict_strategy == EvictStrategy::kLru ||
       Table::evict_strategy == EvictStrategy::kEpochLru)
          ? nullptr
          : scores,
      evicted_keys, evicted_values, evicted_scores, stream);
  evict_buffer->SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  for (size_t i = 0; i < filtered_len; i++) {
    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =
        evict_buffer->scores_ptr(false)[i];
  }

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  size_t new_cap = table_size_after + filtered_len;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,
                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,
                             evicted_values, filtered_len * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,
                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  size_t key_miss_cnt = 0;
  size_t value_diff_cnt = 0;
  size_t score_error_cnt1 = 0;
  size_t score_error_cnt2 = 0;

  for (int64_t i = new_cap - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_after_insert[h_tmp_keys[i]] = *vec;
    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
    if (i >= (new_cap - filtered_len)) {
      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));
      if (!valid) {
        score_error_cnt1++;
      }
    }
  }

  for (auto it : scores_map_current_batch) {
    const K key = it.first;
    const K score = it.second;
    S current_score = scores_map_after_insert[key];
    S score_before_insert = 0;
    if (values_map_after_insert.find(key) != values_map_after_insert.end() &&
        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {
      score_before_insert = scores_map_before_insert[key];
    }
    bool valid = ((current_score >> 32) == global_epoch) &&
                 ((current_score & 0xFFFFFFFF) ==
                  ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));

    if (!valid) {
      score_error_cnt2++;
    }
  }

  for (auto& it : values_map_before_insert) {
    if (values_map_after_insert.find(it.first) ==
        values_map_after_insert.end()) {
      ++key_miss_cnt;
      continue;
    }
    test_util::ValueArray<V, dim>& vec0 = it.second;
    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);
    for (size_t j = 0; j < dim; j++) {
      if (vec0[j] != vec1[j]) {
        ++value_diff_cnt;
        break;
      }
    }
  }

  std::cout << "Check insert_and_evict behavior got "
            << "key_miss_cnt: " << key_miss_cnt
            << ", value_diff_cnt: " << value_diff_cnt
            << ", score_error_cnt1: " << score_error_cnt1
            << ", score_error_cnt2: " << score_error_cnt2
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  ASSERT_EQ(key_miss_cnt, 0);
  ASSERT_EQ(value_diff_cnt, 0);
  ASSERT_EQ(score_error_cnt1, 0);
  ASSERT_EQ(score_error_cnt2, 0);

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  free(h_tmp_founds);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) {
  const size_t U = 1024 * 1024;
  const size_t B = 100000;
  constexpr size_t dim = 16;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = U;
  opt.max_hbm_for_vectors = U * dim * sizeof(V);
  opt.max_bucket_size = 128;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<K, V, S> evict_buffer;
  evict_buffer.Reserve(B, dim, stream);
  evict_buffer.ToZeros(stream);

  test_util::KVMSBuffer<K, V, S> data_buffer;
  test_util::KVMSBuffer<K, V, S> pre_data_buffer;
  data_buffer.Reserve(B, dim, stream);
  pre_data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  int freq_range = 100;
  float repeat_rate = 0.9;
  for (unsigned int global_epoch = 1; global_epoch <= 20; global_epoch++) {
    repeat_rate = global_epoch <= 1 ? 0.0 : 0.1;
    if (global_epoch <= 1) {
      test_util::create_random_keys_advanced<K, S, V>(
          dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
          data_buffer.values_ptr(false), (int)B, B * 32, freq_range);
    } else {
      test_util::create_random_keys_advanced<K, S, V>(
          dim, data_buffer.keys_ptr(false), pre_data_buffer.keys_ptr(false),
          data_buffer.scores_ptr(false), data_buffer.values_ptr(false), (int)B,
          B * 32, freq_range, repeat_rate);
    }
    data_buffer.SyncData(true, stream);
    if (global_epoch <= 1) {
      pre_data_buffer.CopyFrom(data_buffer, stream);
    }

    CheckAssignOnEpochLfu<K, V, S, Table, dim>(table.get(), &data_buffer,
                                               &evict_buffer, &pre_data_buffer,
                                               B, stream, opt, global_epoch);

    pre_data_buffer.CopyFrom(data_buffer, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    offset += B;
  }
}

// Runs the basic LRU eviction scenario twice with different arguments.
// NOTE(review): the meaning of the arguments (and of the defaulted second
// argument in the second call) is defined by test_evict_strategy_lru_basic,
// which is declared outside this view - confirm there.
TEST(AssignValuesTest, test_evict_strategy_lru_basic) {
  test_evict_strategy_lru_basic(16, 21);
  test_evict_strategy_lru_basic(0);
}
// Runs the basic EpochLfu eviction scenario twice with different arguments.
// NOTE(review): the parameter list of test_evict_strategy_epochlfu_basic
// starts outside this view; confirm what the first argument and the optional
// second argument stand for.
TEST(AssignValuesTest, test_evict_strategy_epochlfu_basic) {
  test_evict_strategy_epochlfu_basic(16);
  test_evict_strategy_epochlfu_basic(0, 8);
}
// Runs the multi-epoch assign / insert_and_evict check. The argument is the
// helper's `max_hbm_for_vectors`, which it converts with nv::merlin::GB().
TEST(AssignValuesTest, test_assign_advanced_on_epochlfu) {
  test_assign_advanced_on_epochlfu(16);
}

================================================
FILE: tests/dual_bucket_test.cc.cu
================================================
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <iostream>
#include <numeric>
#include <random>
#include <unordered_set>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

// Default vector dimension; used as the default `dim` of the table helpers
// and for sizing value buffers in the tests below.
constexpr size_t DIM = 16;
// Key, value-element and score types shared by every test in this file.
using K = uint64_t;
using V = float;
using S = uint64_t;
// Short names for the library types used throughout the tests.
using TableOptions = nv::merlin::HashTableOptions;
using TableMode = nv::merlin::TableMode;
using EvictStrategy = nv::merlin::EvictStrategy;

/*
 * Helper: create a MEMORY_MODE table with fixed capacity.
 */
template <typename Table>
void create_memory_mode_table(Table& table, size_t capacity, size_t dim = DIM) {
  TableOptions options;
  options.init_capacity = capacity;
  options.max_capacity = capacity;
  options.max_hbm_for_vectors = 0;
  options.dim = dim;
  options.max_bucket_size = 128;
  options.table_mode = TableMode::kMemory;
  table.init(options);
}

/*
 * Helper: create a THROUGHPUT_MODE table with fixed capacity.
 */
template <typename Table>
void create_throughput_mode_table(Table& table, size_t capacity,
                                  size_t dim = DIM) {
  TableOptions options;
  options.init_capacity = capacity;
  options.max_capacity = capacity;
  options.max_hbm_for_vectors = 0;
  options.dim = dim;
  options.max_bucket_size = 128;
  options.table_mode = TableMode::kThroughput;
  table.init(options);
}

// ==============================
// TestGroup 1: Basic Correctness
// ==============================

// T1.1: kMemory-mode round trip - insert_or_assign a batch of unique keys,
// then find() every one of them and compare the returned vectors.
TEST(DualBucketTest, BasicInsertAndFind) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 1024;  // 131072 slots
  constexpr size_t N = static_cast<size_t>(CAPACITY * 0.5);
  constexpr size_t key_bytes = N * sizeof(K);
  constexpr size_t score_bytes = N * sizeof(S);
  constexpr size_t value_bytes = N * DIM * sizeof(V);
  constexpr size_t found_bytes = N * sizeof(bool);

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // Host inputs: keys 1..N, score equal to the key, and every lane of a
  // value vector equal to key * 0.00001f.
  std::vector<K> h_keys(N);
  std::vector<S> h_scores(N);
  std::vector<V> h_values(N * DIM);
  for (size_t i = 0; i < N; i++) {
    h_keys[i] = i + 1;
    h_scores[i] = i + 1;
    std::fill_n(h_values.data() + i * DIM, DIM,
                static_cast<V>(h_keys[i] * 0.00001f));
  }

  // Device buffers for the inputs and for the find() outputs.
  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  S* dev_scores = nullptr;
  bool* dev_founds = nullptr;
  V* dev_found_values = nullptr;

  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&dev_founds, found_bytes));
  CUDA_CHECK(cudaMalloc(&dev_found_values, value_bytes));

  CUDA_CHECK(
      cudaMemcpy(dev_keys, h_keys.data(), key_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, h_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, h_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  // Insert the whole batch on the default stream.
  table.insert_or_assign(N, dev_keys, dev_values, dev_scores, /*stream=*/0,
                         /*unique_key=*/true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Every key must have been stored.
  size_t table_size = table.size(/*stream=*/0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(table_size, N);

  // Look the same keys up again.
  table.find(N, dev_keys, dev_found_values, dev_founds, /*scores=*/nullptr,
             /*stream=*/0);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Bring both outputs back before checking them.
  bool* h_founds = new bool[N];
  std::vector<V> h_found_values(N * DIM);
  CUDA_CHECK(
      cudaMemcpy(h_founds, dev_founds, found_bytes, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_found_values.data(), dev_found_values, value_bytes,
                        cudaMemcpyDeviceToHost));

  for (size_t i = 0; i < N; i++) {
    EXPECT_TRUE(h_founds[i]) << "Key " << h_keys[i] << " not found";
  }

  for (size_t i = 0; i < N; i++) {
    for (size_t j = 0; j < DIM; j++) {
      EXPECT_FLOAT_EQ(h_found_values[i * DIM + j],
                      static_cast<V>(h_keys[i] * 0.00001f))
          << "Value mismatch for key " << h_keys[i] << " dim " << j;
    }
  }

  delete[] h_founds;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_founds));
  CUDA_CHECK(cudaFree(dev_found_values));
}

// T1.2: kMemory-mode update path - inserting keys that already exist must
// overwrite their values without growing the table.
//
// The keys are inserted once with value 1.0f and a second time with value
// 2.0f. Afterwards the table size must still be N, every key must be
// reported as found, and every returned lane must equal 2.0f.
TEST(DualBucketTest, UpdateExistingKey) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 1024;
  constexpr size_t N = 1024;

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // Keys 1..N, scores 1..N, and two constant value sets.
  std::vector<K> h_keys(N);
  std::vector<V> h_values_v1(N * DIM, 1.0f);
  std::vector<V> h_values_v2(N * DIM, 2.0f);
  std::vector<S> h_scores(N);

  std::iota(h_keys.begin(), h_keys.end(), 1);
  std::iota(h_scores.begin(), h_scores.end(), 1);

  K* d_keys;
  V* d_values;
  S* d_scores;
  bool* d_founds;
  V* d_found_values;

  CUDA_CHECK(cudaMalloc(&d_keys, N * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_values, N * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&d_scores, N * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys.data(), N * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N * sizeof(S),
                        cudaMemcpyHostToDevice));

  // Insert V1.
  CUDA_CHECK(cudaMemcpy(d_values, h_values_v1.data(), N * DIM * sizeof(V),
                        cudaMemcpyHostToDevice));
  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Update with V2.
  CUDA_CHECK(cudaMemcpy(d_values, h_values_v2.data(), N * DIM * sizeof(V),
                        cudaMemcpyHostToDevice));
  table.insert_or_assign(N, d_keys, d_values, d_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Size should still be N (no duplicates).
  size_t table_size = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(table_size, N);

  // Find and verify V2 values.
  table.find(N, d_keys, d_found_values, d_founds, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Check the found flags first: the value buffer is only meaningful for
  // keys that were actually located.
  bool* h_founds = new bool[N];
  CUDA_CHECK(
      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < N; i++) {
    EXPECT_TRUE(h_founds[i])
        << "Key " << h_keys[i] << " not found after update";
  }
  delete[] h_founds;

  std::vector<V> h_found_values(N * DIM);
  CUDA_CHECK(cudaMemcpy(h_found_values.data(), d_found_values,
                        N * DIM * sizeof(V), cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < N; i++) {
    for (size_t j = 0; j < DIM; j++) {
      EXPECT_FLOAT_EQ(h_found_values[i * DIM + j], 2.0f)
          << "Expected V2 value for key " << h_keys[i] << " dim " << j;
    }
  }

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_values));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_founds));
  CUDA_CHECK(cudaFree(d_found_values));
}

// T1.3: kMemory-mode eviction by score - after the table has been filled
// with low-score keys, a batch of higher-score keys is inserted and most of
// that batch must be retrievable afterwards.
TEST(DualBucketTest, ScoreEviction) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Deliberately small so that the fill phase uses every slot.
  constexpr size_t CAPACITY = 128 * 128;  // 16384 slots
  constexpr size_t N_FILL = CAPACITY;     // number of low-score keys
  constexpr size_t N_NEW = 1024;          // number of high-score keys

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // Device buffers are sized for the larger (fill) batch and reused later.
  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  S* dev_scores = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_keys, N_FILL * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&dev_values, N_FILL * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&dev_scores, N_FILL * sizeof(S)));

  // Phase 1: keys 1..N_FILL with scores 1..N_FILL and value 1.0f.
  {
    std::vector<K> fill_keys(N_FILL);
    std::vector<S> fill_scores(N_FILL);
    const std::vector<V> fill_values(N_FILL * DIM, 1.0f);
    std::iota(fill_keys.begin(), fill_keys.end(), K{1});
    std::iota(fill_scores.begin(), fill_scores.end(), S{1});

    CUDA_CHECK(cudaMemcpy(dev_keys, fill_keys.data(), N_FILL * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_values, fill_values.data(),
                          N_FILL * DIM * sizeof(V), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_scores, fill_scores.data(), N_FILL * sizeof(S),
                          cudaMemcpyHostToDevice));

    table.insert_or_assign(N_FILL, dev_keys, dev_values, dev_scores, 0, true);
    CUDA_CHECK(cudaDeviceSynchronize());
  }

  // Phase 2: new keys N_FILL+1.. with scores N_FILL+1000.. and value 2.0f.
  std::vector<K> new_keys(N_NEW);
  std::vector<S> new_scores(N_NEW);
  const std::vector<V> new_values(N_NEW * DIM, 2.0f);
  std::iota(new_keys.begin(), new_keys.end(), K{N_FILL + 1});
  std::iota(new_scores.begin(), new_scores.end(), S{N_FILL + 1000});

  CUDA_CHECK(cudaMemcpy(dev_keys, new_keys.data(), N_NEW * sizeof(K),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, new_values.data(),
                        N_NEW * DIM * sizeof(V), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, new_scores.data(), N_NEW * sizeof(S),
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(N_NEW, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Phase 3: look the new keys up and count how many are present.
  bool* dev_founds = nullptr;
  V* dev_found_values = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_founds, N_NEW * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&dev_found_values, N_NEW * DIM * sizeof(V)));

  CUDA_CHECK(cudaMemcpy(dev_keys, new_keys.data(), N_NEW * sizeof(K),
                        cudaMemcpyHostToDevice));
  table.find(N_NEW, dev_keys, dev_found_values, dev_founds, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* h_founds = new bool[N_NEW];
  CUDA_CHECK(cudaMemcpy(h_founds, dev_founds, N_NEW * sizeof(bool),
                        cudaMemcpyDeviceToHost));

  int found_count =
      static_cast<int>(std::count(h_founds, h_founds + N_NEW, true));
  std::cout << "[ScoreEviction] High-score keys accuracy: " << found_count
            << "/" << N_NEW << " (" << (100.0 * found_count / N_NEW) << "%)"
            << std::endl;
  // The integer threshold is 819, so "greater than" means at least 820 of
  // 1024 keys, i.e. at least 80%.
  EXPECT_GT(found_count, static_cast<int>(N_NEW * 0.8))
      << "Expected >= 80% of high-score keys to survive eviction";

  delete[] h_founds;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_founds));
  CUDA_CHECK(cudaFree(dev_found_values));
}

// T1.4: kThroughput-mode regression - a plain insert_or_assign followed by
// find must still report every inserted key as found.
TEST(DualBucketTest, ThroughputModeRegression) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 1024;
  constexpr size_t N = 4096;
  constexpr size_t key_bytes = N * sizeof(K);
  constexpr size_t score_bytes = N * sizeof(S);
  constexpr size_t value_bytes = N * DIM * sizeof(V);
  constexpr size_t found_bytes = N * sizeof(bool);

  Table table;
  create_throughput_mode_table(table, CAPACITY);

  // Keys 1..N, score equal to the key, every lane equal to key * 0.001f.
  std::vector<K> h_keys(N);
  std::vector<S> h_scores(N);
  std::vector<V> h_values(N * DIM);
  for (size_t i = 0; i < N; i++) {
    h_keys[i] = i + 1;
    h_scores[i] = i + 1;
    std::fill_n(h_values.data() + i * DIM, DIM,
                static_cast<V>(h_keys[i] * 0.001f));
  }

  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  S* dev_scores = nullptr;
  bool* dev_founds = nullptr;
  V* dev_found_values = nullptr;

  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&dev_founds, found_bytes));
  CUDA_CHECK(cudaMalloc(&dev_found_values, value_bytes));

  CUDA_CHECK(
      cudaMemcpy(dev_keys, h_keys.data(), key_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, h_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, h_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(N, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  table.find(N, dev_keys, dev_found_values, dev_founds, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Only the found flags are inspected; the returned values are not.
  bool* h_founds = new bool[N];
  CUDA_CHECK(
      cudaMemcpy(h_founds, dev_founds, found_bytes, cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < N; i++) {
    EXPECT_TRUE(h_founds[i])
        << "THROUGHPUT_MODE: Key " << h_keys[i] << " not found";
  }

  delete[] h_founds;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_founds));
  CUDA_CHECK(cudaFree(dev_found_values));
}

// ==========================================
// TestGroup 2: Dual-bucket Feature Verify
// ==========================================

// T2.2: Load factor reached before the first eviction in kMemory mode.
//
// Unique keys with ascending scores are inserted in batches of 128. After
// every batch the table size is compared with the number of keys inserted so
// far; the first time the size falls behind, the load factor
// (size / capacity) at that moment is recorded and must exceed 0.980.
// If the whole capacity is consumed without the size ever falling behind,
// the load factor of the completely filled table is used instead.
TEST(DualBucketTest, FirstEvictionLoadFactor) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 10 * 1024 * 1024;  // 10485760 slots
  constexpr size_t BATCH = 128;

  Table table;
  create_memory_mode_table(table, CAPACITY);

  std::vector<K> h_keys(BATCH);
  std::vector<V> h_values(BATCH * DIM, 1.0f);
  std::vector<S> h_scores(BATCH);

  K* d_keys;
  V* d_values;
  S* d_scores;
  CUDA_CHECK(cudaMalloc(&d_keys, BATCH * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_values, BATCH * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&d_scores, BATCH * sizeof(S)));

  // The value payload is the same for every batch, so upload it only once
  // instead of once per loop iteration.
  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), BATCH * DIM * sizeof(V),
                        cudaMemcpyHostToDevice));

  K next_key = 1;
  size_t total_inserted = 0;
  size_t table_size = 0;
  bool eviction_seen = false;

  // Insert in batches until the size falls behind or capacity is reached.
  while (total_inserted < CAPACITY) {
    for (size_t i = 0; i < BATCH; i++) {
      h_keys[i] = next_key++;
      h_scores[i] = h_keys[i];  // Score = key value (ascending)
    }
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys.data(), BATCH * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), BATCH * sizeof(S),
                          cudaMemcpyHostToDevice));

    table.insert_or_assign(BATCH, d_keys, d_values, d_scores, 0, true);
    CUDA_CHECK(cudaDeviceSynchronize());
    total_inserted += BATCH;

    table_size = table.size(0);
    CUDA_CHECK(cudaDeviceSynchronize());

    // Fewer stored keys than inserted keys means something was dropped.
    if (table_size < total_inserted) {
      eviction_seen = true;
      break;
    }
  }

  // With an eviction this is the load factor at that moment; without one
  // the table is completely full and the ratio is 1.0.
  const float first_eviction_lf =
      static_cast<float>(table_size) / static_cast<float>(CAPACITY);

  if (!eviction_seen) {
    std::cout << "[MEMORY_MODE] No eviction observed up to full capacity"
              << std::endl;
  }
  std::cout << "[MEMORY_MODE] First eviction LF: " << first_eviction_lf
            << " (total_inserted=" << total_inserted << ")" << std::endl;

  // Threshold taken from the original test, which cites ~0.982 measured at
  // this scale on an A6000.
  EXPECT_GT(first_eviction_lf, 0.980f)
      << "Dual-bucket should delay eviction beyond 98.0% LF";

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_values));
  CUDA_CHECK(cudaFree(d_scores));
}

// ===================================
// TestGroup 3: API Guard Tests
// ===================================

// erase() on a kMemory-mode table is expected to throw std::runtime_error.
TEST(DualBucketTest, EraseGuard) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 128;
  const K h_key = 1;

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // A single device-resident key so the call receives well-formed input.
  K* d_keys = nullptr;
  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(h_key)));
  CUDA_CHECK(
      cudaMemcpy(d_keys, &h_key, sizeof(h_key), cudaMemcpyHostToDevice));

  EXPECT_THROW(table.erase(1, d_keys, 0), std::runtime_error);

  CUDA_CHECK(cudaFree(d_keys));
}

// contains() on a kMemory-mode table is expected to throw
// std::runtime_error.
TEST(DualBucketTest, ContainsGuard) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 128;
  const K h_key = 1;

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // One key and one result slot so the call receives well-formed input.
  K* d_keys = nullptr;
  bool* d_founds = nullptr;
  CUDA_CHECK(cudaMalloc(&d_keys, sizeof(h_key)));
  CUDA_CHECK(cudaMalloc(&d_founds, sizeof(bool)));
  CUDA_CHECK(
      cudaMemcpy(d_keys, &h_key, sizeof(h_key), cudaMemcpyHostToDevice));

  EXPECT_THROW(table.contains(1, d_keys, d_founds, 0), std::runtime_error);

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_founds));
}

// reserve() on a kMemory-mode table is expected to throw
// std::runtime_error; the requested size is twice the table's capacity.
TEST(DualBucketTest, ReserveGuard) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 128;

  Table table;
  create_memory_mode_table(table, CAPACITY);

  EXPECT_THROW(table.reserve(128 * 256, 0), std::runtime_error);
}

// ===================================
// TestGroup 4: Boundary Conditions
// ===================================

// T4.1: find() on a freshly created, empty kMemory-mode table must not
// report any key as found.
TEST(DualBucketTest, EmptyTableFind) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t N = 64;
  constexpr size_t key_bytes = N * sizeof(K);
  constexpr size_t found_bytes = N * sizeof(bool);

  Table table;
  create_memory_mode_table(table, 128 * 128);

  // Query keys 1..N; nothing has been inserted.
  std::vector<K> h_keys(N);
  std::iota(h_keys.begin(), h_keys.end(), 1);

  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  bool* dev_founds = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, N * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&dev_founds, found_bytes));

  CUDA_CHECK(
      cudaMemcpy(dev_keys, h_keys.data(), key_bytes, cudaMemcpyHostToDevice));

  table.find(N, dev_keys, dev_values, dev_founds, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* h_founds = new bool[N];
  CUDA_CHECK(
      cudaMemcpy(h_founds, dev_founds, found_bytes, cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < N; i++) {
    EXPECT_FALSE(h_founds[i])
        << "Empty table should not find key " << h_keys[i];
  }

  delete[] h_founds;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_founds));
}

// T4.4: Different dim values.
TEST(DualBucketTest, DimVariation) {
  // For each tested dim: build a kMemory-mode table, insert 256 keys whose
  // value rows are filled with the row index, then read them back via find()
  // and compare every element.
  using DimTable = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t kCapacity = 128 * 128;
  constexpr size_t kNumKeys = 256;

  // dim=1 and dim=64 are covered (per the original author's note these hit
  // different VecV specializations).  dim > 224 is not exercised here: the
  // original note states the dual-bucket lookup kernel has a fixed 896-byte
  // shared-memory buffer and that init() rejects dim > 224 in kMemory mode.
  const size_t dims_under_test[] = {1, 64};
  for (const size_t dim : dims_under_test) {
    DimTable table;

    TableOptions opts;
    opts.table_mode = TableMode::kMemory;
    opts.max_bucket_size = 128;
    opts.dim = dim;
    opts.max_hbm_for_vectors = 0;
    opts.max_capacity = kCapacity;
    opts.init_capacity = kCapacity;
    table.init(opts);

    const size_t value_count = kNumKeys * dim;
    const size_t key_bytes = kNumKeys * sizeof(K);
    const size_t value_bytes = kNumKeys * dim * sizeof(V);
    const size_t score_bytes = kNumKeys * sizeof(S);
    const size_t flag_bytes = kNumKeys * sizeof(bool);

    // Host inputs: keys 1..kNumKeys, score == key, value row r holds r.
    std::vector<K> host_keys(kNumKeys);
    std::vector<V> host_values(value_count);
    std::vector<S> host_scores(kNumKeys);
    std::iota(host_keys.begin(), host_keys.end(), 1);
    for (size_t row = 0; row < kNumKeys; ++row) {
      host_scores[row] = row + 1;
    }
    for (size_t flat = 0; flat < value_count; ++flat) {
      host_values[flat] = static_cast<V>(flat / dim);
    }

    K* dev_keys = nullptr;
    V* dev_values = nullptr;
    S* dev_scores = nullptr;
    bool* dev_flags = nullptr;
    V* dev_fetched = nullptr;
    CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
    CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
    CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
    CUDA_CHECK(cudaMalloc(&dev_flags, flag_bytes));
    CUDA_CHECK(cudaMalloc(&dev_fetched, value_bytes));

    CUDA_CHECK(cudaMemcpy(dev_keys, host_keys.data(), key_bytes,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_values, host_values.data(), value_bytes,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_scores, host_scores.data(), score_bytes,
                          cudaMemcpyHostToDevice));

    table.insert_or_assign(kNumKeys, dev_keys, dev_values, dev_scores, 0, true);
    CUDA_CHECK(cudaDeviceSynchronize());

    table.find(kNumKeys, dev_keys, dev_fetched, dev_flags, nullptr, 0);
    CUDA_CHECK(cudaDeviceSynchronize());

    bool* host_flags = new bool[kNumKeys];
    CUDA_CHECK(cudaMemcpy(host_flags, dev_flags, flag_bytes,
                          cudaMemcpyDeviceToHost));
    std::vector<V> fetched(value_count);
    CUDA_CHECK(cudaMemcpy(fetched.data(), dev_fetched, value_bytes,
                          cudaMemcpyDeviceToHost));

    for (size_t row = 0; row < kNumKeys; ++row) {
      const bool hit = host_flags[row];
      EXPECT_TRUE(hit) << "dim=" << dim << ": Key " << host_keys[row]
                       << " not found";
      if (!hit) {
        continue;  // Values are only meaningful for keys that were found.
      }
      const V expected = static_cast<V>(row);
      for (size_t col = 0; col < dim; ++col) {
        EXPECT_FLOAT_EQ(fetched[row * dim + col], expected)
            << "dim=" << dim << ": Value mismatch key " << host_keys[row]
            << " dim " << col;
      }
    }

    delete[] host_flags;
    CUDA_CHECK(cudaFree(dev_keys));
    CUDA_CHECK(cudaFree(dev_values));
    CUDA_CHECK(cudaFree(dev_scores));
    CUDA_CHECK(cudaFree(dev_flags));
    CUDA_CHECK(cudaFree(dev_fetched));
  }
}

// ===================================
// TestGroup 5: Init Validation
// ===================================

TEST(DualBucketTest, InitCapacityMismatchReject) {
  // init() must throw when kMemory mode is requested with
  // init_capacity != max_capacity.
  using RejectTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions mismatched;
  mismatched.table_mode = TableMode::kMemory;
  mismatched.max_bucket_size = 128;
  mismatched.dim = DIM;
  mismatched.max_hbm_for_vectors = 0;
  mismatched.init_capacity = 128 * 128;
  mismatched.max_capacity = 128 * 256;  // Deliberately twice init_capacity.

  RejectTable table;
  EXPECT_THROW(table.init(mismatched), std::runtime_error);
}

// ===================================
// TestGroup 2 additions
// ===================================

// T2.3: b1 == b2 degeneration.
// When a key's two bucket indices collide, the kernel must degenerate to
// single-bucket behaviour without data corruption or deadlock.
TEST(DualBucketTest, B1EqualsB2Degeneration) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // A tiny bucket count is used so that keys whose two candidate buckets
  // coincide are common (the original author estimates ~25% per key with
  // 4 buckets).
  constexpr size_t kBuckets = 4;
  constexpr size_t kCapacity = kBuckets * 128;  // 512 slots
  constexpr size_t kNumKeys = 256;

  const size_t value_count = kNumKeys * DIM;
  const size_t key_bytes = kNumKeys * sizeof(K);
  const size_t value_bytes = kNumKeys * DIM * sizeof(V);
  const size_t score_bytes = kNumKeys * sizeof(S);
  const size_t flag_bytes = kNumKeys * sizeof(bool);

  Table table;
  create_memory_mode_table(table, kCapacity);

  // Keys 1..kNumKeys; score equals the key; every value element of a row
  // equals that row's key.
  std::vector<K> host_keys(kNumKeys);
  std::vector<V> host_values(value_count);
  std::vector<S> host_scores(kNumKeys);
  std::iota(host_keys.begin(), host_keys.end(), 1);
  for (size_t row = 0; row < kNumKeys; ++row) {
    host_scores[row] = row + 1;
  }
  for (size_t flat = 0; flat < value_count; ++flat) {
    host_values[flat] = static_cast<V>(host_keys[flat / DIM]);
  }

  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  S* dev_scores = nullptr;
  bool* dev_flags = nullptr;
  V* dev_fetched = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&dev_flags, flag_bytes));
  CUDA_CHECK(cudaMalloc(&dev_fetched, value_bytes));

  CUDA_CHECK(cudaMemcpy(dev_keys, host_keys.data(), key_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, host_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, host_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(kNumKeys, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Every inserted key must be retrievable, colliding buckets or not.
  table.find(kNumKeys, dev_keys, dev_fetched, dev_flags, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* host_flags = new bool[kNumKeys];
  CUDA_CHECK(
      cudaMemcpy(host_flags, dev_flags, flag_bytes, cudaMemcpyDeviceToHost));
  std::vector<V> fetched(value_count);
  CUDA_CHECK(cudaMemcpy(fetched.data(), dev_fetched, value_bytes,
                        cudaMemcpyDeviceToHost));

  for (size_t row = 0; row < kNumKeys; ++row) {
    const bool hit = host_flags[row];
    EXPECT_TRUE(hit) << "Key " << host_keys[row] << " not found (b1==b2?)";
    if (!hit) {
      continue;
    }
    // Only the first element of each value row is compared.
    EXPECT_FLOAT_EQ(fetched[row * DIM], static_cast<V>(host_keys[row]))
        << "Value mismatch for key " << host_keys[row];
  }

  // The entry count must be exactly kNumKeys, i.e. no key was stored twice.
  const size_t entries = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(entries, kNumKeys);

  delete[] host_flags;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_flags));
  CUDA_CHECK(cudaFree(dev_fetched));
}

// T2.5: Digest effectiveness — verify that dual-bucket digest (bit[56:63])
// is used consistently during init, insert, and find.  If the init kernel
// wrote the wrong empty-digest value, empty-slot detection would fail and
// no keys could be inserted.  This test therefore doubles as a regression
// guard for the G1 digest-mismatch bug.
TEST(DualBucketTest, DigestEffectiveness) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t kCapacity = 128 * 64;  // 8192 slots, 64 buckets
  constexpr size_t kNumKeys = 4096;       // 50% LF

  const size_t key_bytes = kNumKeys * sizeof(K);
  const size_t value_bytes = kNumKeys * DIM * sizeof(V);
  const size_t score_bytes = kNumKeys * sizeof(S);
  const size_t flag_bytes = kNumKeys * sizeof(bool);

  Table table;
  create_memory_mode_table(table, kCapacity);

  std::vector<K> host_keys(kNumKeys);
  std::vector<V> host_values(kNumKeys * DIM, 1.0f);
  std::vector<S> host_scores(kNumKeys);

  // Pseudo-random keys from a fixed seed: the top byte is cleared and the
  // lowest bit is set (the original comment: "avoid reserved keys").
  std::mt19937_64 rng(42);
  size_t idx = 0;
  for (K& key : host_keys) {
    key = (rng() & 0x00FFFFFFFFFFFFFF) | 1;
    host_scores[idx] = idx + 1;
    ++idx;
  }

  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  S* dev_scores = nullptr;
  bool* dev_flags = nullptr;
  V* dev_fetched = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&dev_flags, flag_bytes));
  CUDA_CHECK(cudaMalloc(&dev_fetched, value_bytes));

  CUDA_CHECK(cudaMemcpy(dev_keys, host_keys.data(), key_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, host_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, host_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(kNumKeys, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // First check: the table must report exactly kNumKeys entries.  Per the
  // original author's reasoning, a wrong empty-digest would push inserts onto
  // the eviction path and leave entries refused.
  const size_t entries = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(entries, kNumKeys) << "Digest mismatch: expected " << kNumKeys
                               << " entries but got " << entries
                               << " (empty-slot detection likely failed)";

  // Second check: every inserted key must be reported as found.
  table.find(kNumKeys, dev_keys, dev_fetched, dev_flags, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* host_flags = new bool[kNumKeys];
  CUDA_CHECK(
      cudaMemcpy(host_flags, dev_flags, flag_bytes, cudaMemcpyDeviceToHost));
  int hits = 0;
  for (size_t i = 0; i < kNumKeys; ++i) {
    hits += host_flags[i] ? 1 : 0;
  }
  EXPECT_EQ(hits, static_cast<int>(kNumKeys))
      << "Digest mismatch on find: only " << hits << "/" << kNumKeys
      << " keys found";

  delete[] host_flags;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_flags));
  CUDA_CHECK(cudaFree(dev_fetched));
}

// ===================================
// TestGroup 1 addition: Score ordering after eviction
// ===================================

// T1.3b: After eviction, surviving keys must have scores >= the scores of
// evicted keys.  We export the full table and verify score ordering.
TEST(DualBucketTest, ScoreOrderingAfterEviction) {
  // Fills a table to its nominal capacity with low scores, inserts a batch of
  // much higher-scored keys, exports the table and checks that at least one
  // of the high-score keys is present among the exported entries.
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 64;  // 8192 slots
  constexpr size_t N_FILL = CAPACITY;
  constexpr size_t N_NEW = 512;

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // Phase 1: Fill with keys 1..N_FILL and scores [1..N_FILL].
  std::vector<K> h_keys(N_FILL);
  std::vector<V> h_values(N_FILL * DIM, 1.0f);
  std::vector<S> h_scores(N_FILL);

  std::iota(h_keys.begin(), h_keys.end(), 1);
  for (size_t i = 0; i < N_FILL; i++) h_scores[i] = i + 1;

  // The device input buffers are sized for the larger (fill) batch and are
  // reused for the smaller high-score batch in phase 2.
  K* d_keys;
  V* d_values;
  S* d_scores;
  CUDA_CHECK(cudaMalloc(&d_keys, N_FILL * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_values, N_FILL * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&d_scores, N_FILL * sizeof(S)));

  CUDA_CHECK(cudaMemcpy(d_keys, h_keys.data(), N_FILL * sizeof(K),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), N_FILL * DIM * sizeof(V),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores.data(), N_FILL * sizeof(S),
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(N_FILL, d_keys, d_values, d_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Phase 2: Insert high-score keys (keys above N_FILL) that force eviction.
  std::vector<K> h_keys_new(N_NEW);
  std::vector<V> h_values_new(N_NEW * DIM, 2.0f);
  std::vector<S> h_scores_new(N_NEW);
  for (size_t i = 0; i < N_NEW; i++) {
    h_keys_new[i] = N_FILL + 1 + i;
    h_scores_new[i] = N_FILL * 10 + i;  // Much higher scores
  }

  CUDA_CHECK(cudaMemcpy(d_keys, h_keys_new.data(), N_NEW * sizeof(K),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_values, h_values_new.data(), N_NEW * DIM * sizeof(V),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores_new.data(), N_NEW * sizeof(S),
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(N_NEW, d_keys, d_values, d_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Phase 3: Export all surviving entries and check scores.
  size_t table_size = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());

  K* d_dump_keys;
  V* d_dump_values;
  S* d_dump_scores;
  size_t* d_dump_counter;
  CUDA_CHECK(cudaMalloc(&d_dump_keys, table_size * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_dump_values, table_size * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&d_dump_scores, table_size * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));
  CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));

  // NOTE(review): table_size is passed as export_batch's first argument.
  // Confirm that this argument is an entry limit and not a slot-scan length;
  // if it is the latter, entries beyond the first table_size slots would be
  // skipped whenever the table is not completely full.
  table.export_batch(table_size, 0, d_dump_counter, d_dump_keys, d_dump_values,
                     d_dump_scores, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  size_t dumped;
  CUDA_CHECK(cudaMemcpy(&dumped, d_dump_counter, sizeof(size_t),
                        cudaMemcpyDeviceToHost));

  if (dumped == 0) {
    // Nothing was exported.  Taking the minimum of an empty range would
    // dereference end(), so report the failure explicitly and fall through
    // to the cleanup below instead.
    ADD_FAILURE() << "export_batch produced no entries (table_size="
                  << table_size << ")";
  } else {
    std::vector<S> h_dump_scores(dumped);
    CUDA_CHECK(cudaMemcpy(h_dump_scores.data(), d_dump_scores,
                          dumped * sizeof(S), cudaMemcpyDeviceToHost));

    // Find the minimum score among all surviving entries.
    S min_surviving =
        *std::min_element(h_dump_scores.begin(), h_dump_scores.end());

    // Check that all high-score keys that were inserted have scores above
    // the surviving minimum.  (Some high-score keys may have been REFUSED,
    // but if they ARE in the table, their score must be consistent.)
    std::vector<K> h_dump_keys(dumped);
    CUDA_CHECK(cudaMemcpy(h_dump_keys.data(), d_dump_keys, dumped * sizeof(K),
                          cudaMemcpyDeviceToHost));

    int high_score_survivors = 0;
    for (size_t i = 0; i < dumped; i++) {
      if (h_dump_keys[i] > N_FILL) {
        high_score_survivors++;
        // NOTE(review): min_surviving is the minimum over h_dump_scores
        // itself, so this comparison holds by construction; the meaningful
        // assertion of this test is the survivor count below.
        EXPECT_GE(h_dump_scores[i], min_surviving);
      }
    }
    // At least some high-score keys should have survived.
    EXPECT_GT(high_score_survivors, 0)
        << "No high-score keys survived eviction";

    std::cout << "[ScoreOrdering] min_surviving_score=" << min_surviving
              << " high_score_survivors=" << high_score_survivors << "/"
              << N_NEW << std::endl;
  }

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_values));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_dump_keys));
  CUDA_CHECK(cudaFree(d_dump_values));
  CUDA_CHECK(cudaFree(d_dump_scores));
  CUDA_CHECK(cudaFree(d_dump_counter));
}

// ===================================
// TestGroup 3 additions: API Guard Tests (new)
// ===================================

TEST(DualBucketTest, FindOrInsertGuard) {
  // find_or_insert() is expected to throw std::runtime_error on a table
  // produced by create_memory_mode_table().
  using GuardedTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  GuardedTable guarded;
  create_memory_mode_table(guarded, 128 * 128);

  // Room for exactly one key / value row / score.  The buffers are left
  // uninitialised; the test only checks that the call throws.
  K* dev_key = nullptr;
  V* dev_value = nullptr;
  S* dev_score = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_key, sizeof(K)));
  CUDA_CHECK(cudaMalloc(&dev_value, DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&dev_score, sizeof(S)));

  EXPECT_THROW(
      guarded.find_or_insert(1, dev_key, dev_value, dev_score, 0, true),
      std::runtime_error);

  CUDA_CHECK(cudaFree(dev_key));
  CUDA_CHECK(cudaFree(dev_value));
  CUDA_CHECK(cudaFree(dev_score));
}

TEST(DualBucketTest, InsertAndEvictGuard) {
  // insert_and_evict() is expected to throw std::runtime_error on a table
  // produced by create_memory_mode_table().
  using GuardedTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  GuardedTable guarded;
  create_memory_mode_table(guarded, 128 * 128);

  const size_t one_key = sizeof(K);
  const size_t one_row = DIM * sizeof(V);
  const size_t one_score = sizeof(S);

  // Input triple (key, value row, score).
  K* in_key = nullptr;
  V* in_value = nullptr;
  S* in_score = nullptr;
  // Output triple for an evicted entry, plus the eviction counter.
  K* out_key = nullptr;
  V* out_value = nullptr;
  S* out_score = nullptr;
  size_t* out_count = nullptr;

  CUDA_CHECK(cudaMalloc(&in_key, one_key));
  CUDA_CHECK(cudaMalloc(&in_value, one_row));
  CUDA_CHECK(cudaMalloc(&in_score, one_score));
  CUDA_CHECK(cudaMalloc(&out_key, one_key));
  CUDA_CHECK(cudaMalloc(&out_value, one_row));
  CUDA_CHECK(cudaMalloc(&out_score, one_score));
  CUDA_CHECK(cudaMalloc(&out_count, sizeof(size_t)));

  EXPECT_THROW(guarded.insert_and_evict(1, in_key, in_value, in_score, out_key,
                                        out_value, out_score, out_count, 0),
               std::runtime_error);

  CUDA_CHECK(cudaFree(in_key));
  CUDA_CHECK(cudaFree(in_value));
  CUDA_CHECK(cudaFree(in_score));
  CUDA_CHECK(cudaFree(out_key));
  CUDA_CHECK(cudaFree(out_value));
  CUDA_CHECK(cudaFree(out_score));
  CUDA_CHECK(cudaFree(out_count));
}

TEST(DualBucketTest, AccumOrAssignGuard) {
  // accum_or_assign() is expected to throw std::runtime_error on a table
  // produced by create_memory_mode_table().
  using GuardedTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  GuardedTable guarded;
  create_memory_mode_table(guarded, 128 * 128);

  // One key, one value row and one accumulate/assign flag on the device.
  K* dev_key = nullptr;
  V* dev_value = nullptr;
  bool* dev_accum_flag = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_key, sizeof(K)));
  CUDA_CHECK(cudaMalloc(&dev_value, DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&dev_accum_flag, sizeof(bool)));

  EXPECT_THROW(guarded.accum_or_assign(1, dev_key, dev_value, dev_accum_flag),
               std::runtime_error);

  CUDA_CHECK(cudaFree(dev_key));
  CUDA_CHECK(cudaFree(dev_value));
  CUDA_CHECK(cudaFree(dev_accum_flag));
}

TEST(DualBucketTest, AssignScoresGuard) {
  // assign_scores() is expected to throw std::runtime_error on a table
  // produced by create_memory_mode_table().
  using GuardedTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  GuardedTable guarded;
  create_memory_mode_table(guarded, 128 * 128);

  // Single-element device buffers for the key and its score.
  K* dev_key = nullptr;
  S* dev_score = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_key, sizeof(K)));
  CUDA_CHECK(cudaMalloc(&dev_score, sizeof(S)));

  EXPECT_THROW(guarded.assign_scores(1, dev_key, dev_score),
               std::runtime_error);

  CUDA_CHECK(cudaFree(dev_key));
  CUDA_CHECK(cudaFree(dev_score));
}

TEST(DualBucketTest, AssignValuesGuard) {
  // assign_values() is expected to throw std::runtime_error on a table
  // produced by create_memory_mode_table().
  using GuardedTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  GuardedTable guarded;
  create_memory_mode_table(guarded, 128 * 128);

  // Device buffers for one key and one DIM-wide value row.
  K* dev_key = nullptr;
  V* dev_value = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_key, sizeof(K)));
  CUDA_CHECK(cudaMalloc(&dev_value, DIM * sizeof(V)));

  EXPECT_THROW(guarded.assign_values(1, dev_key, dev_value),
               std::runtime_error);

  CUDA_CHECK(cudaFree(dev_key));
  CUDA_CHECK(cudaFree(dev_value));
}

// ===================================
// TestGroup 5 addition: max_hbm_for_vectors rejection
// ===================================

TEST(DualBucketTest, InitHbmForVectorsReject) {
  // init() must throw when kMemory mode is combined with a non-zero
  // max_hbm_for_vectors.
  using RejectTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  constexpr size_t kCapacity = 128 * 128;

  TableOptions bad_hbm;
  bad_hbm.table_mode = TableMode::kMemory;
  bad_hbm.max_bucket_size = 128;
  bad_hbm.dim = DIM;
  bad_hbm.init_capacity = kCapacity;
  bad_hbm.max_capacity = kCapacity;
  bad_hbm.max_hbm_for_vectors = 1024;  // The offending, non-zero setting.

  RejectTable table;
  EXPECT_THROW(table.init(bad_hbm), std::runtime_error);
}

// T5.3: dim > 224 rejected in MEMORY_MODE (shared-memory buffer overflow).
TEST(DualBucketTest, InitDimTooLargeReject) {
  // init() must throw for dim = 256 in kMemory mode; the documented limit in
  // this test group is 224.
  using RejectTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  constexpr size_t kCapacity = 128 * 128;

  TableOptions oversized;
  oversized.table_mode = TableMode::kMemory;
  oversized.max_bucket_size = 128;
  oversized.max_hbm_for_vectors = 0;
  oversized.init_capacity = kCapacity;
  oversized.max_capacity = kCapacity;
  oversized.dim = 256;  // Above the 224 limit.

  RejectTable table;
  EXPECT_THROW(table.init(oversized), std::runtime_error);
}

// T5.3b: dim=224 should be accepted (exact boundary).
TEST(DualBucketTest, InitDimMaxAccepted) {
  // init() must succeed (no exception) for dim = 224 in kMemory mode, the
  // boundary value this test group treats as the maximum.
  using BoundaryTable =
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  constexpr size_t kCapacity = 128 * 128;

  TableOptions at_limit;
  at_limit.table_mode = TableMode::kMemory;
  at_limit.max_bucket_size = 128;
  at_limit.max_hbm_for_vectors = 0;
  at_limit.init_capacity = kCapacity;
  at_limit.max_capacity = kCapacity;
  at_limit.dim = 224;  // Exactly on the boundary.

  BoundaryTable table;
  EXPECT_NO_THROW(table.init(at_limit));
}

// ===================================
// TestGroup 2 addition: Bucket distribution (T2.1)
// ===================================

// Verify that keys are distributed across multiple buckets (not all in b1).
// We insert random keys and check that after export, the table size matches
// expectations.  A more direct check would require bucket-level introspection
// which the public API does not expose, but we can infer distribution by
// checking that the first-eviction LF is significantly higher than single-
// bucket mode (covered in FirstEvictionLoadFactor).  Here we do a simple
// idempotency + size check with random keys to stress the hash distribution.
TEST(DualBucketTest, RandomKeyDistribution) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t kCapacity = 128 * 128;  // 16384 slots
  constexpr size_t kNumKeys = 8192;        // 50% LF

  const size_t key_bytes = kNumKeys * sizeof(K);
  const size_t value_bytes = kNumKeys * DIM * sizeof(V);
  const size_t score_bytes = kNumKeys * sizeof(S);
  const size_t flag_bytes = kNumKeys * sizeof(bool);

  Table table;
  create_memory_mode_table(table, kCapacity);

  std::vector<K> host_keys(kNumKeys);
  std::vector<V> host_values(kNumKeys * DIM, 1.0f);
  std::vector<S> host_scores(kNumKeys);

  // Fixed-seed pseudo-random keys with the top byte cleared and bit 0 set;
  // scores simply count up from 1.
  std::mt19937_64 rng(12345);
  size_t idx = 0;
  for (K& key : host_keys) {
    key = (rng() & 0x00FFFFFFFFFFFFFF) | 1;
    host_scores[idx] = idx + 1;
    ++idx;
  }

  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  S* dev_scores = nullptr;
  bool* dev_flags = nullptr;
  V* dev_fetched = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&dev_flags, flag_bytes));
  CUDA_CHECK(cudaMalloc(&dev_fetched, value_bytes));

  CUDA_CHECK(cudaMemcpy(dev_keys, host_keys.data(), key_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, host_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, host_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  // First insertion: the entry count must equal the number of keys.
  table.insert_or_assign(kNumKeys, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  const size_t entries_first = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(entries_first, kNumKeys)
      << "Random keys at 50% LF should all be inserted without eviction";

  // Second insertion of the identical batch: the entry count must not change.
  table.insert_or_assign(kNumKeys, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  const size_t entries_second = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(entries_second, kNumKeys) << "Re-insert must not create duplicates";

  // Finally, every key must be reported as found.
  table.find(kNumKeys, dev_keys, dev_fetched, dev_flags, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* host_flags = new bool[kNumKeys];
  CUDA_CHECK(
      cudaMemcpy(host_flags, dev_flags, flag_bytes, cudaMemcpyDeviceToHost));
  size_t slot = 0;
  for (const K& key : host_keys) {
    EXPECT_TRUE(host_flags[slot]) << "Random key " << key << " not found";
    ++slot;
  }

  delete[] host_flags;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_flags));
  CUDA_CHECK(cudaFree(dev_fetched));
}

// ===================================
// TestGroup 4 addition: Single bucket capacity (T4.2)
// ===================================

// T4.2: Single-bucket capacity must be rejected by MEMORY_MODE init guard.
// Dual-bucket addressing requires at least 2 buckets (capacity >= 256).
TEST(DualBucketTest, SingleBucketCapacityRejected) {
  // Building a memory-mode table with only 128 slots (a single bucket of
  // size 128) is expected to throw std::runtime_error.
  using TinyTable = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;
  constexpr size_t kOneBucketSlots = 128;

  TinyTable tiny;
  EXPECT_THROW(create_memory_mode_table(tiny, kOneBucketSlots),
               std::runtime_error);
}

// T4.2b: Minimum valid capacity (2 buckets = 256 slots).
TEST(DualBucketTest, MinimumTwoBucketCapacity) {
  // Smallest accepted configuration: 256 slots.  Inserts 128 sequential keys,
  // then checks both the reported entry count and that each key is found.
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t kCapacity = 256;  // 2 buckets
  constexpr size_t kNumKeys = 128;

  const size_t key_bytes = kNumKeys * sizeof(K);
  const size_t value_bytes = kNumKeys * DIM * sizeof(V);
  const size_t score_bytes = kNumKeys * sizeof(S);
  const size_t flag_bytes = kNumKeys * sizeof(bool);

  Table table;
  create_memory_mode_table(table, kCapacity);

  // Keys 1..kNumKeys, all-ones value rows, score equal to the key.
  std::vector<K> host_keys(kNumKeys);
  std::vector<V> host_values(kNumKeys * DIM, 1.0f);
  std::vector<S> host_scores(kNumKeys);
  std::iota(host_keys.begin(), host_keys.end(), 1);
  for (size_t row = 0; row < kNumKeys; ++row) {
    host_scores[row] = row + 1;
  }

  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  S* dev_scores = nullptr;
  bool* dev_flags = nullptr;
  V* dev_fetched = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&dev_flags, flag_bytes));
  CUDA_CHECK(cudaMalloc(&dev_fetched, value_bytes));

  CUDA_CHECK(cudaMemcpy(dev_keys, host_keys.data(), key_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, host_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, host_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(kNumKeys, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  const size_t entries = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(entries, kNumKeys);

  table.find(kNumKeys, dev_keys, dev_fetched, dev_flags, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* host_flags = new bool[kNumKeys];
  CUDA_CHECK(
      cudaMemcpy(host_flags, dev_flags, flag_bytes, cudaMemcpyDeviceToHost));
  size_t slot = 0;
  for (const K& key : host_keys) {
    EXPECT_TRUE(host_flags[slot]) << "Two-bucket: Key " << key << " not found";
    ++slot;
  }

  delete[] host_flags;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_flags));
  CUDA_CHECK(cudaFree(dev_fetched));
}

// ===================================
// DEBUG: 2-bucket eviction trace
// ===================================

// Small-scale eviction test with kernel printf enabled (buckets_num <= 4).
// Fill 2 buckets (256 slots), then insert 4 high-score keys and trace D2.
TEST(DualBucketTest, DebugEvictionTrace) {
  // Diagnostic test: fills a 2-bucket table, inserts four high-score keys and
  // logs each stage to stdout.  The only assertion is that all four
  // high-score keys are found afterwards.
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t kBuckets = 2;
  constexpr size_t kCapacity = kBuckets * 128;  // 256 slots
  constexpr size_t kFillCount = kCapacity;      // Fill completely
  constexpr size_t kHotCount = 4;               // A few high-score keys

  const size_t fill_key_bytes = kFillCount * sizeof(K);
  const size_t fill_value_bytes = kFillCount * DIM * sizeof(V);
  const size_t fill_score_bytes = kFillCount * sizeof(S);
  const size_t hot_key_bytes = kHotCount * sizeof(K);
  const size_t hot_value_bytes = kHotCount * DIM * sizeof(V);
  const size_t hot_score_bytes = kHotCount * sizeof(S);

  Table table;
  create_memory_mode_table(table, kCapacity);

  // ---- Phase 1: fill with keys 1..256, score equal to the key. ----
  std::vector<K> fill_keys(kFillCount);
  std::vector<V> fill_values(kFillCount * DIM, 1.0f);
  std::vector<S> fill_scores(kFillCount);
  std::iota(fill_keys.begin(), fill_keys.end(), 1);
  for (size_t row = 0; row < kFillCount; ++row) {
    fill_scores[row] = row + 1;
  }

  // Input buffers are sized for the fill batch and reused for later phases.
  K* dev_keys = nullptr;
  V* dev_values = nullptr;
  S* dev_scores = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_keys, fill_key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, fill_value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, fill_score_bytes));

  CUDA_CHECK(cudaMemcpy(dev_keys, fill_keys.data(), fill_key_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, fill_values.data(), fill_value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, fill_scores.data(), fill_score_bytes,
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(kFillCount, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  const size_t entries_after_fill = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  std::cout << "[DebugEviction] After fill: table_size=" << entries_after_fill
            << " capacity=" << kCapacity << std::endl;

  // Look every fill key up again and log the ones that are missing.  This
  // step only prints; it asserts nothing.
  {
    bool* dev_fill_flags = nullptr;
    V* dev_fill_fetched = nullptr;
    CUDA_CHECK(cudaMalloc(&dev_fill_flags, kFillCount * sizeof(bool)));
    CUDA_CHECK(cudaMalloc(&dev_fill_fetched, fill_value_bytes));
    CUDA_CHECK(cudaMemcpy(dev_keys, fill_keys.data(), fill_key_bytes,
                          cudaMemcpyHostToDevice));
    table.find(kFillCount, dev_keys, dev_fill_fetched, dev_fill_flags, nullptr,
               0);
    CUDA_CHECK(cudaDeviceSynchronize());

    bool* host_fill_flags = new bool[kFillCount];
    CUDA_CHECK(cudaMemcpy(host_fill_flags, dev_fill_flags,
                          kFillCount * sizeof(bool), cudaMemcpyDeviceToHost));
    int fill_hits = 0;
    for (size_t row = 0; row < kFillCount; ++row) {
      if (!host_fill_flags[row]) {
        std::cout << "[DebugEviction] MISSING fill key=" << fill_keys[row]
                  << " (index=" << row << ")" << std::endl;
        continue;
      }
      ++fill_hits;
    }
    std::cout << "[DebugEviction] Fill verify: found " << fill_hits << "/"
              << kFillCount << " keys" << std::endl;

    delete[] host_fill_flags;
    CUDA_CHECK(cudaFree(dev_fill_flags));
    CUDA_CHECK(cudaFree(dev_fill_fetched));
  }

  // ---- Phase 2: insert the high-score keys. ----
  std::vector<K> hot_keys(kHotCount);
  std::vector<V> hot_values(kHotCount * DIM, 2.0f);
  std::vector<S> hot_scores(kHotCount);
  for (size_t row = 0; row < kHotCount; ++row) {
    hot_keys[row] = kFillCount + 100 + row;
    hot_scores[row] = 10000 + row;
  }

  std::cout << "[DebugEviction] Inserting " << kHotCount << " high-score keys "
            << "(scores " << hot_scores.front() << ".." << hot_scores.back()
            << ")" << std::endl;

  CUDA_CHECK(cudaMemcpy(dev_keys, hot_keys.data(), hot_key_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, hot_values.data(), hot_value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, hot_scores.data(), hot_score_bytes,
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(kHotCount, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  const size_t entries_after_evict = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  std::cout << "[DebugEviction] After evict-insert: table_size="
            << entries_after_evict << std::endl;

  // ---- Phase 3: look the high-score keys up. ----
  bool* dev_flags = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_flags, kHotCount * sizeof(bool)));
  V* dev_fetched = nullptr;
  CUDA_CHECK(cudaMalloc(&dev_fetched, hot_value_bytes));

  CUDA_CHECK(cudaMemcpy(dev_keys, hot_keys.data(), hot_key_bytes,
                        cudaMemcpyHostToDevice));
  table.find(kHotCount, dev_keys, dev_fetched, dev_flags, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* host_flags = new bool[kHotCount];
  CUDA_CHECK(cudaMemcpy(host_flags, dev_flags, kHotCount * sizeof(bool),
                        cudaMemcpyDeviceToHost));
  int hot_hits = 0;
  for (size_t row = 0; row < kHotCount; ++row) {
    const bool hit = host_flags[row];
    std::cout << "[DebugEviction] key=" << hot_keys[row]
              << " score=" << hot_scores[row]
              << " found=" << (hit ? "YES" : "NO") << std::endl;
    hot_hits += hit ? 1 : 0;
  }
  std::cout << "[DebugEviction] Found " << hot_hits << "/" << kHotCount
            << std::endl;

  EXPECT_EQ(hot_hits, static_cast<int>(kHotCount))
      << "All high-score keys should survive eviction in 2-bucket table";

  delete[] host_flags;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_flags));
  CUDA_CHECK(cudaFree(dev_fetched));
}

// ===================================
// TestGroup 2 addition: Eviction Quality (T2.6)
// ===================================

// T2.6: After inserting 5x capacity keys with random scores, the surviving keys
// in the table should overlap with the theoretical top-capacity scores by at
// least QUALITY_THRESHOLD (99.5%).  This validates that dual-bucket score-based
// eviction correctly retains high-score keys under sustained oversubscription
// pressure.
TEST(DualBucketTest, EvictionQualityAtFullLoad) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 1024;      // 128K slots
  constexpr size_t TOTAL_KEYS = 5 * CAPACITY;  // 5x oversubscription
  constexpr size_t BATCH = CAPACITY;           // One capacity per batch
  // Minimum fraction of exported keys that must belong to the ideal set.
  constexpr double QUALITY_THRESHOLD = 0.995;

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // Generate all keys (1..TOTAL_KEYS) with random scores.
  std::vector<K> all_keys(TOTAL_KEYS);
  std::vector<S> all_scores(TOTAL_KEYS);
  std::iota(all_keys.begin(), all_keys.end(), 1);

  // Fixed seed: the score sequence is the same on every run.
  std::mt19937_64 rng(42);
  for (size_t i = 0; i < TOTAL_KEYS; i++) {
    all_scores[i] = (rng() >> 1) | 1;  // Positive, non-zero
  }

  // Allocate device memory for one batch.
  K* d_keys;
  V* d_values;
  S* d_scores;
  CUDA_CHECK(cudaMalloc(&d_keys, BATCH * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_values, BATCH * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&d_scores, BATCH * sizeof(S)));

  // Every key carries the same constant value vector; only scores matter here.
  std::vector<V> h_values(BATCH * DIM, 1.0f);

  // Insert all keys in 5 batches.
  for (size_t offset = 0; offset < TOTAL_KEYS; offset += BATCH) {
    size_t n = std::min(BATCH, TOTAL_KEYS - offset);
    CUDA_CHECK(cudaMemcpy(d_keys, all_keys.data() + offset, n * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_values, h_values.data(), n * DIM * sizeof(V),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, all_scores.data() + offset, n * sizeof(S),
                          cudaMemcpyHostToDevice));
    table.insert_or_assign(n, d_keys, d_values, d_scores, 0, true);
    CUDA_CHECK(cudaDeviceSynchronize());
  }

  // Export surviving keys and scores.
  size_t table_size = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());

  K* d_dump_keys;
  V* d_dump_values;
  S* d_dump_scores;
  size_t* d_dump_counter;
  CUDA_CHECK(cudaMalloc(&d_dump_keys, table_size * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_dump_values, table_size * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&d_dump_scores, table_size * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));
  CUDA_CHECK(cudaMemset(d_dump_counter, 0, sizeof(size_t)));

  table.export_batch(table_size, 0, d_dump_counter, d_dump_keys, d_dump_values,
                     d_dump_scores, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Number of entries export_batch reported through the device counter.
  size_t dumped;
  CUDA_CHECK(cudaMemcpy(&dumped, d_dump_counter, sizeof(size_t),
                        cudaMemcpyDeviceToHost));

  std::vector<K> h_dump_keys(dumped);
  CUDA_CHECK(cudaMemcpy(h_dump_keys.data(), d_dump_keys, dumped * sizeof(K),
                        cudaMemcpyDeviceToHost));

  // Compute the ideal top-`dumped` set: keys with the highest scores out of
  // all TOTAL_KEYS inserted during the entire test.
  std::vector<std::pair<S, K>> score_key_pairs(TOTAL_KEYS);
  for (size_t i = 0; i < TOTAL_KEYS; i++) {
    score_key_pairs[i] = {all_scores[i], all_keys[i]};
  }
  // Descending by score.
  std::sort(score_key_pairs.begin(), score_key_pairs.end(),
            [](const auto& a, const auto& b) { return a.first > b.first; });

  std::unordered_set<K> ideal_set;
  for (size_t i = 0; i < dumped && i < TOTAL_KEYS; i++) {
    ideal_set.insert(score_key_pairs[i].second);
  }

  // Count overlap between surviving keys and ideal set.
  size_t overlap = 0;
  for (size_t i = 0; i < dumped; i++) {
    if (ideal_set.count(h_dump_keys[i])) overlap++;
  }

  // An empty export would turn the ratio below into 0/0 (NaN).  Flag it
  // explicitly and score it as 0 so the threshold check fails with a finite,
  // readable value.
  EXPECT_GT(dumped, static_cast<size_t>(0)) << "export_batch exported no keys";
  double quality =
      dumped == 0 ? 0.0
                  : static_cast<double>(overlap) / static_cast<double>(dumped);
  std::cout << "[EvictionQuality] Table size: " << dumped << "/" << CAPACITY
            << " (LF=" << (static_cast<double>(dumped) / CAPACITY) << ")"
            << std::endl;
  std::cout << "[EvictionQuality] Overlap with ideal top-" << dumped << ": "
            << overlap << "/" << dumped << " (quality=" << (quality * 100.0)
            << "%)" << std::endl;

  EXPECT_GE(quality, QUALITY_THRESHOLD)
      << "Eviction quality " << (quality * 100.0) << "% is below "
      << (QUALITY_THRESHOLD * 100.0) << "% threshold";

  // Cleanup.
  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_values));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_dump_keys));
  CUDA_CHECK(cudaFree(d_dump_values));
  CUDA_CHECK(cudaFree(d_dump_scores));
  CUDA_CHECK(cudaFree(d_dump_counter));
}

// ===================================
// TestGroup 6: Concurrency Stress Tests
// ===================================

// T6.1: Multi-stream concurrent upsert stress test.
// Multiple CUDA streams issue insert_or_assign concurrently to stress Phase 2
// eviction's stale-score handling.  Under high contention some inserts may be
// REFUSED, but the table must remain consistent: no crashes, no duplicates,
// and all surviving keys must be findable.
TEST(DualBucketTest, MultiStreamConcurrentUpsert) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 1024;  // 128K slots
  constexpr int NUM_STREAMS = 4;
  constexpr size_t KEYS_PER_STREAM = CAPACITY;  // Each stream fills capacity
  constexpr size_t TOTAL_KEYS = NUM_STREAMS * KEYS_PER_STREAM;

  // Byte sizes of the per-stream device buffers.
  const size_t key_bytes = KEYS_PER_STREAM * sizeof(K);
  const size_t value_bytes = KEYS_PER_STREAM * DIM * sizeof(V);
  const size_t score_bytes = KEYS_PER_STREAM * sizeof(S);

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // One CUDA stream per concurrent writer.
  cudaStream_t streams[NUM_STREAMS];
  for (cudaStream_t& stream : streams) {
    CUDA_CHECK(cudaStreamCreate(&stream));
  }

  // Device-side input, one set of buffers per stream.
  K* dev_keys[NUM_STREAMS];
  V* dev_values[NUM_STREAMS];
  S* dev_scores[NUM_STREAMS];

  // A single generator is shared by all streams, so the score sequence
  // continues from one stream's batch to the next.
  std::mt19937_64 rng(42);
  for (int sid = 0; sid < NUM_STREAMS; ++sid) {
    CUDA_CHECK(cudaMalloc(&dev_keys[sid], key_bytes));
    CUDA_CHECK(cudaMalloc(&dev_values[sid], value_bytes));
    CUDA_CHECK(cudaMalloc(&dev_scores[sid], score_bytes));

    std::vector<K> host_keys(KEYS_PER_STREAM);
    std::vector<V> host_values(KEYS_PER_STREAM * DIM,
                               static_cast<V>(sid + 1));
    std::vector<S> host_scores(KEYS_PER_STREAM);

    for (size_t k = 0; k < KEYS_PER_STREAM; ++k) {
      // Key ranges of different streams never overlap.
      host_keys[k] = sid * KEYS_PER_STREAM + k + 1;
      // Random score with the top bit cleared and the low bit set.
      host_scores[k] = (rng() >> 1) | 1;
    }

    CUDA_CHECK(cudaMemcpy(dev_keys[sid], host_keys.data(), key_bytes,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_values[sid], host_values.data(), value_bytes,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_scores[sid], host_scores.data(), score_bytes,
                          cudaMemcpyHostToDevice));
  }

  // Issue every insert before waiting on any of them.
  for (int sid = 0; sid < NUM_STREAMS; ++sid) {
    table.insert_or_assign(KEYS_PER_STREAM, dev_keys[sid], dev_values[sid],
                           dev_scores[sid], streams[sid],
                           /*unique_key=*/true);
  }

  // Wait for all streams to drain.
  for (cudaStream_t& stream : streams) {
    CUDA_CHECK(cudaStreamSynchronize(stream));
  }

  // Table-level sanity checks.
  size_t table_size = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  std::cout << "[MultiStream] Table size after concurrent inserts: "
            << table_size << "/" << CAPACITY << std::endl;

  // The table may not hold more entries than it has slots ...
  EXPECT_LE(table_size, CAPACITY);
  // ... and at least some inserts must have landed.
  EXPECT_GT(table_size, static_cast<size_t>(0));

  // Dump everything that survived so it can be looked up again.
  K* dev_dump_keys;
  V* dev_dump_values;
  S* dev_dump_scores;
  size_t* dev_dump_counter;
  CUDA_CHECK(cudaMalloc(&dev_dump_keys, table_size * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&dev_dump_values, table_size * DIM * sizeof(V)));
  CUDA_CHECK(cudaMalloc(&dev_dump_scores, table_size * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&dev_dump_counter, sizeof(size_t)));
  CUDA_CHECK(cudaMemset(dev_dump_counter, 0, sizeof(size_t)));

  table.export_batch(table_size, 0, dev_dump_counter, dev_dump_keys,
                     dev_dump_values, dev_dump_scores, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  size_t dumped;
  CUDA_CHECK(cudaMemcpy(&dumped, dev_dump_counter, sizeof(size_t),
                        cudaMemcpyDeviceToHost));
  EXPECT_EQ(dumped, table_size);

  // Look up each exported key; all of them have to be present.
  bool* dev_founds;
  V* dev_found_values;
  CUDA_CHECK(cudaMalloc(&dev_founds, dumped * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&dev_found_values, dumped * DIM * sizeof(V)));

  table.find(dumped, dev_dump_keys, dev_found_values, dev_founds, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* h_founds = new bool[dumped];
  CUDA_CHECK(cudaMemcpy(h_founds, dev_founds, dumped * sizeof(bool),
                        cudaMemcpyDeviceToHost));

  const int found_count =
      static_cast<int>(std::count(h_founds, h_founds + dumped, true));
  std::cout << "[MultiStream] Find consistency: " << found_count << "/"
            << dumped << std::endl;
  EXPECT_EQ(found_count, static_cast<int>(dumped))
      << "All surviving keys must be findable after concurrent upserts";

  // Duplicate check: the set of exported keys must be as large as the export.
  std::vector<K> host_dump_keys(dumped);
  CUDA_CHECK(cudaMemcpy(host_dump_keys.data(), dev_dump_keys,
                        dumped * sizeof(K), cudaMemcpyDeviceToHost));
  std::unordered_set<K> unique_keys(host_dump_keys.begin(),
                                    host_dump_keys.end());
  EXPECT_EQ(unique_keys.size(), dumped) << "Duplicate keys found in table";

  // Release host and device resources.
  delete[] h_founds;
  for (int sid = 0; sid < NUM_STREAMS; ++sid) {
    CUDA_CHECK(cudaFree(dev_keys[sid]));
    CUDA_CHECK(cudaFree(dev_values[sid]));
    CUDA_CHECK(cudaFree(dev_scores[sid]));
    CUDA_CHECK(cudaStreamDestroy(streams[sid]));
  }
  CUDA_CHECK(cudaFree(dev_dump_keys));
  CUDA_CHECK(cudaFree(dev_dump_values));
  CUDA_CHECK(cudaFree(dev_dump_scores));
  CUDA_CHECK(cudaFree(dev_dump_counter));
  CUDA_CHECK(cudaFree(dev_founds));
  CUDA_CHECK(cudaFree(dev_found_values));
}

// T6.2: Multi-stream concurrent upsert with overlapping keys.
// Tests that concurrent streams inserting the same keys do not create
// duplicates, and that the final values/scores are consistent.
TEST(DualBucketTest, MultiStreamOverlappingKeys) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 1024;
  constexpr int NUM_STREAMS = 4;
  constexpr size_t N = 32768;  // Shared key set

  Table table;
  create_memory_mode_table(table, CAPACITY);

  cudaStream_t streams[NUM_STREAMS];
  for (int s = 0; s < NUM_STREAMS; s++) {
    CUDA_CHECK(cudaStreamCreate(&streams[s]));
  }

  // All streams insert the SAME keys (1..N) with different scores.
  std::vector<K> h_keys(N);
  std::iota(h_keys.begin(), h_keys.end(), 1);

  K* d_keys[NUM_STREAMS];
  V* d_values[NUM_STREAMS];
  S* d_scores[NUM_STREAMS];

  for (int s = 0; s < NUM_STREAMS; s++) {
    CUDA_CHECK(cudaMalloc(&d_keys[s], N * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_values[s], N * DIM * sizeof(V)));
    CUDA_CHECK(cudaMalloc(&d_scores[s], N * sizeof(S)));

    // Stream s writes the constant value (s + 1) into every vector element.
    std::vector<V> h_values(N * DIM, static_cast<V>(s + 1));
    std::vector<S> h_scores(N);
    for (size_t i = 0; i < N; i++) {
      // For a given key index i, each stream supplies a different score.
      h_scores[i] = (s + 1) * 1000 + i;
    }

    CUDA_CHECK(cudaMemcpy(d_keys[s], h_keys.data(), N * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_values[s], h_values.data(), N * DIM * sizeof(V),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores[s], h_scores.data(), N * sizeof(S),
                          cudaMemcpyHostToDevice));
  }

  // Launch concurrent inserts.
  for (int s = 0; s < NUM_STREAMS; s++) {
    table.insert_or_assign(N, d_keys[s], d_values[s], d_scores[s], streams[s],
                           true);
  }

  for (int s = 0; s < NUM_STREAMS; s++) {
    CUDA_CHECK(cudaStreamSynchronize(streams[s]));
  }

  // Table size must equal N (no duplicates from concurrent inserts).
  size_t table_size = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  std::cout << "[MultiStreamOverlap] Table size: " << table_size
            << " (expected " << N << ")" << std::endl;
  // Report both sizes instead of `table_size - N`: that unsigned difference
  // wraps around to a huge bogus value whenever keys are missing
  // (table_size < N).
  EXPECT_EQ(table_size, N)
      << "Concurrent inserts of the same keys left " << table_size
      << " entries; expected exactly " << N
      << (table_size > N ? " (duplicates created)" : " (keys missing)");

  // All keys must be findable.
  bool* d_founds;
  V* d_found_values;
  CUDA_CHECK(cudaMalloc(&d_founds, N * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_found_values, N * DIM * sizeof(V)));

  // Every stream holds the same key set, so stream 0's copy is used for lookup.
  table.find(N, d_keys[0], d_found_values, d_founds, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* h_founds = new bool[N];
  CUDA_CHECK(
      cudaMemcpy(h_founds, d_founds, N * sizeof(bool), cudaMemcpyDeviceToHost));

  int found_count = 0;
  for (size_t i = 0; i < N; i++) {
    if (h_founds[i]) found_count++;
  }
  EXPECT_EQ(found_count, static_cast<int>(N))
      << "All keys must be findable after concurrent overlapping inserts";

  // Cleanup.
  delete[] h_founds;
  for (int s = 0; s < NUM_STREAMS; s++) {
    CUDA_CHECK(cudaFree(d_keys[s]));
    CUDA_CHECK(cudaFree(d_values[s]));
    CUDA_CHECK(cudaFree(d_scores[s]));
    CUDA_CHECK(cudaStreamDestroy(streams[s]));
  }
  CUDA_CHECK(cudaFree(d_founds));
  CUDA_CHECK(cudaFree(d_found_values));
}

// ===================================
// TestGroup 7: Additional Missing Tests
// ===================================

// T7.1: Find with scores=nullptr (CopyScoreEmpty path).
TEST(DualBucketTest, FindWithNullScores) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 128;
  constexpr size_t N = 1024;

  // Buffer sizes in bytes, shared by the allocations and copies below.
  const size_t key_bytes = N * sizeof(K);
  const size_t value_bytes = N * DIM * sizeof(V);
  const size_t score_bytes = N * sizeof(S);
  const size_t found_bytes = N * sizeof(bool);

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // Host input: keys 1..N, constant value vectors, scores 1..N.
  std::vector<K> host_keys(N);
  std::vector<V> host_values(N * DIM, 1.0f);
  std::vector<S> host_scores(N);
  std::iota(host_keys.begin(), host_keys.end(), 1);
  for (size_t idx = 0; idx < N; ++idx) {
    host_scores[idx] = idx + 1;
  }

  K* dev_keys;
  V* dev_values;
  S* dev_scores;
  bool* dev_founds;
  V* dev_found_values;
  S* dev_found_scores;
  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&dev_founds, found_bytes));
  CUDA_CHECK(cudaMalloc(&dev_found_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_found_scores, score_bytes));

  CUDA_CHECK(cudaMemcpy(dev_keys, host_keys.data(), key_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, host_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, host_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(N, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  // First lookup: no score output buffer (CopyScoreEmpty branch).
  table.find(N, dev_keys, dev_found_values, dev_founds, /*scores=*/nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* h_founds = new bool[N];
  CUDA_CHECK(
      cudaMemcpy(h_founds, dev_founds, found_bytes, cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < N; ++i) {
    EXPECT_TRUE(h_founds[i])
        << "Key " << host_keys[i] << " not found (null scores)";
  }

  // Second lookup: scores requested (CopyScoreByPassCache branch).
  table.find(N, dev_keys, dev_found_values, dev_founds, dev_found_scores, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  CUDA_CHECK(
      cudaMemcpy(h_founds, dev_founds, found_bytes, cudaMemcpyDeviceToHost));
  std::vector<S> h_found_scores(N);
  CUDA_CHECK(cudaMemcpy(h_found_scores.data(), dev_found_scores, score_bytes,
                        cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < N; ++i) {
    EXPECT_TRUE(h_founds[i])
        << "Key " << host_keys[i] << " not found (with scores)";
    // The score is only checked for keys that were actually found.
    if (!h_founds[i]) continue;
    EXPECT_GT(h_found_scores[i], static_cast<S>(0))
        << "Score should be non-zero for key " << host_keys[i];
  }

  delete[] h_founds;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_founds));
  CUDA_CHECK(cudaFree(dev_found_values));
  CUDA_CHECK(cudaFree(dev_found_scores));
}

// T7.2: Clear then re-insert (verifies dual_bucket_empty_digest reset).
TEST(DualBucketTest, ClearAndReinsert) {
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  constexpr size_t CAPACITY = 128 * 128;
  constexpr size_t N = 2048;

  // Buffer sizes in bytes, shared by the allocations and copies below.
  const size_t key_bytes = N * sizeof(K);
  const size_t value_bytes = N * DIM * sizeof(V);
  const size_t score_bytes = N * sizeof(S);
  const size_t found_bytes = N * sizeof(bool);

  Table table;
  create_memory_mode_table(table, CAPACITY);

  // First generation of input: keys 1..N, constant values, scores 1..N.
  std::vector<K> first_keys(N);
  std::vector<V> host_values(N * DIM, 1.0f);
  std::vector<S> host_scores(N);
  std::iota(first_keys.begin(), first_keys.end(), 1);
  for (size_t idx = 0; idx < N; ++idx) {
    host_scores[idx] = idx + 1;
  }

  K* dev_keys;
  V* dev_values;
  S* dev_scores;
  bool* dev_founds;
  V* dev_found_values;
  CUDA_CHECK(cudaMalloc(&dev_keys, key_bytes));
  CUDA_CHECK(cudaMalloc(&dev_values, value_bytes));
  CUDA_CHECK(cudaMalloc(&dev_scores, score_bytes));
  CUDA_CHECK(cudaMalloc(&dev_founds, found_bytes));
  CUDA_CHECK(cudaMalloc(&dev_found_values, value_bytes));

  CUDA_CHECK(cudaMemcpy(dev_keys, first_keys.data(), key_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_values, host_values.data(), value_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(dev_scores, host_scores.data(), score_bytes,
                        cudaMemcpyHostToDevice));

  // Step 1: populate the table with the first generation.
  table.insert_or_assign(N, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(table.size(0), N);

  // Step 2: wipe it.
  table.clear(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(table.size(0), static_cast<size_t>(0));

  // Step 3: insert a second generation (keys N+1..2N) that shares no key with
  // the first; values and scores already on the device are reused.
  std::vector<K> second_keys(N);
  std::iota(second_keys.begin(), second_keys.end(), N + 1);
  CUDA_CHECK(cudaMemcpy(dev_keys, second_keys.data(), key_bytes,
                        cudaMemcpyHostToDevice));

  table.insert_or_assign(N, dev_keys, dev_values, dev_scores, 0, true);
  CUDA_CHECK(cudaDeviceSynchronize());

  size_t table_size = table.size(0);
  CUDA_CHECK(cudaDeviceSynchronize());
  EXPECT_EQ(table_size, N)
      << "After clear + re-insert, table should have N entries";

  // Step 4: every second-generation key has to be present.
  table.find(N, dev_keys, dev_found_values, dev_founds, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  bool* h_founds = new bool[N];
  CUDA_CHECK(
      cudaMemcpy(h_founds, dev_founds, found_bytes, cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < N; ++i) {
    EXPECT_TRUE(h_founds[i])
        << "Key " << second_keys[i] << " not found after clear + re-insert";
  }

  // Step 5: no first-generation key may still be present.
  CUDA_CHECK(cudaMemcpy(dev_keys, first_keys.data(), key_bytes,
                        cudaMemcpyHostToDevice));
  table.find(N, dev_keys, dev_found_values, dev_founds, nullptr, 0);
  CUDA_CHECK(cudaDeviceSynchronize());

  CUDA_CHECK(
      cudaMemcpy(h_founds, dev_founds, found_bytes, cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < N; ++i) {
    EXPECT_FALSE(h_founds[i])
        << "Old key " << first_keys[i] << " still found after clear";
  }

  delete[] h_founds;
  CUDA_CHECK(cudaFree(dev_keys));
  CUDA_CHECK(cudaFree(dev_values));
  CUDA_CHECK(cudaFree(dev_scores));
  CUDA_CHECK(cudaFree(dev_founds));
  CUDA_CHECK(cudaFree(dev_found_values));
}


================================================
FILE: tests/dynamic_max_capacity_test.cc.cu
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

// Number of elements in every value vector used by this test (assigned to
// TableOptions::dim and used as the ValueArray width).
constexpr size_t dim = 64;
// Short aliases for the table's template arguments: i64 is the key type,
// f32 the value element type and u64 the score type.
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

/*
 * test_dynamic_max_capcity_table creates a table with a small
 * capacity and inserts batches of kv pairs until its load factor
 * reaches the threshold (0.98). It then doubles max_capacity and
 * keeps inserting until the load factor reaches the threshold again,
 * repeating until max_capacity would exceed the upper limit.
 */
// Grows a table step by step: insert batches until the load factor reaches
// the threshold, double max_capacity, reserve, and repeat until max_capacity
// would exceed `uplimit`.  Every key that is evicted during the inserts or
// exported at the end is recorded in a host map, and each recorded value
// vector must equal its key in every element.
void test_dynamic_max_capcity_table() {
  size_t len = 10000llu;  // Batch size for both insert and export.
  size_t max_capacity = 1 << 14;
  size_t init_capacity = 1 << 12;
  size_t offset = 0;
  size_t uplimit = 1 << 20;  // Stop once max_capacity would exceed this.
  float load_factor_threshold = 0.98f;

  TableOptions opt;
  opt.max_capacity = max_capacity;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = uplimit * dim * sizeof(f32);
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;
  opt.dim = dim;

  using Vec_t = test_util::ValueArray<f32, dim>;
  // Host-side record of every key/value pair seen as evicted or exported.
  std::map<i64, Vec_t> ref_map;
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));
  // Destroys the stream on every exit path (normal return, early return from
  // ASSERT_*, exception).  It is declared before the table and the buffers so
  // that it is destroyed after them.
  struct StreamGuard {
    cudaStream_t s;
    ~StreamGuard() { cudaStreamDestroy(s); }
  } stream_guard{stream};

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(len, dim, stream);
  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(len, dim, stream);

  size_t total_len = 0;
  while (true) {
    buffer.ToRange(offset, /*skip=1*/ 1, stream);
    size_t n_evicted = table->insert_and_evict(
        len, buffer.keys_ptr(), buffer.values_ptr(), nullptr,
        evict_buffer.keys_ptr(), evict_buffer.values_ptr(), nullptr, stream);
    printf("Insert %zu keys and evict %zu\n", len, n_evicted);
    offset += len;
    total_len += len;
    evict_buffer.SyncData(/*h2d=*/false, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    // Remember everything the table pushed out during this insert.
    for (size_t i = 0; i < n_evicted; i++) {
      Vec_t* vec =
          reinterpret_cast<Vec_t*>(evict_buffer.values_ptr(false) + i * dim);
      ref_map[evict_buffer.keys_ptr(false)[i]] = *vec;
    }

    if (table->load_factor(stream) >= load_factor_threshold) {
      ASSERT_GE(table->size(stream),
                static_cast<size_t>((static_cast<float>(max_capacity) *
                                     load_factor_threshold)));
      max_capacity *= 2;
      if (max_capacity > uplimit) {
        // max_capacity is left at twice the table's real capacity here; the
        // final ASSERT_EQ below relies on that.
        break;
      }
      // What we need.
      printf("----> check change max_capacity from %zu to %zu\n",
             table->capacity(), max_capacity);
      table->set_max_capacity(max_capacity);
      table->reserve(max_capacity, stream);
      ASSERT_EQ(max_capacity, table->capacity());
      ASSERT_LE(table->load_factor(stream), 0.5f);
    }

    // Safety net against an endless loop if the threshold is never reached.
    if (total_len > uplimit * 2) {
      throw std::runtime_error("Traverse too much keys but not finish test.");
    }
  }

  // Export the whole table in windows of at most `len` slots.
  offset = 0;
  for (; offset < table->capacity(); offset += len) {
    size_t search_len = len;
    if (offset + search_len > table->capacity()) {
      search_len = table->capacity() - offset;
    }
    size_t n_exported =
        table->export_batch(search_len, offset, buffer.keys_ptr(),
                            buffer.values_ptr(), /*scores=*/nullptr, stream);
    // Copy back on the same stream the export ran on (as the eviction path
    // above does), so the cudaStreamSynchronize below covers the copy too.
    buffer.SyncData(/*h2d=*/false, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    for (size_t i = 0; i < n_exported; i++) {
      Vec_t* vec = reinterpret_cast<Vec_t*>(buffer.values_ptr(false) + i * dim);
      for (size_t j = 0; j < dim; j++) {
        ASSERT_EQ(buffer.keys_ptr(false)[i], vec->operator[](j));
      }
      ref_map[buffer.keys_ptr(false)[i]] = *vec;
    }
  }

  printf("---> uplimit: %zu\n", uplimit);
  printf("---> table size: %zu\n", table->size(stream));
  printf("---> table cap: %zu\n", table->capacity());
  printf("---> cpu table size: %zu\n", ref_map.size());
  // Every recorded vector (evicted or exported) must equal its key.
  for (auto& it : ref_map) {
    for (size_t j = 0; j < dim; j++) {
      ASSERT_EQ(static_cast<f32>(it.first), it.second.data[j]);
    }
  }
  ASSERT_EQ(table->capacity() * 2, max_capacity);
  ASSERT_GE(static_cast<float>(ref_map.size()),
            static_cast<float>(table->capacity()) * load_factor_threshold);
}

// gtest entry point; all checks are made inside
// test_dynamic_max_capcity_table().
TEST(MerlinHashTableTest, test_dynamic_max_capcity_table) {
  test_dynamic_max_capcity_table();
}


================================================
FILE: tests/export_batch_if_test.cc.cu
================================================
#include <cooperative_groups.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <map>
#include <thread>
#include <unordered_map>
#include <vector>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

// Shorthand used by the tile-based functors below.
namespace cg = cooperative_groups;

// Short aliases for the table's template arguments: i64 is the key type,
// f32 the value element type and u64 the score type.
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;
// Table type exercised by every test in this file.
using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;

// Predicate that selects an entry when its score is strictly below
// `threshold`.  `key` and `pattern` are accepted but not inspected.
template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    const bool below_threshold = score < threshold;
    return below_threshold;
  }
};

// Stateful predicate: `pattern` and `threshold` are captured at construction
// time.  An entry is selected when its score is strictly below the stored
// threshold; `key`, `value` and the tile `g` are not inspected.
template <class K, class V, class S>
struct ExportIfPredFunctorV2 {
  K pattern;
  S threshold;
  ExportIfPredFunctorV2(K pattern_in, S threshold_in)
      : pattern(pattern_in), threshold(threshold_in) {}
  template <int GroupSize>
  __forceinline__ __device__ bool operator()(
      const K& key, const V* value, const S& score,
      cg::thread_block_tile<GroupSize>& g) {
    const bool selected = score < threshold;
    return selected;
  }
};

// Stateful predicate that checks both score and value.  An entry is selected
// when its score is strictly below `threshold` AND the first `dim` elements of
// its value vector all compare equal to its key.  The value check is done
// cooperatively by the whole tile, one candidate lane at a time.
//
// `dim` is not a constructor argument; callers assign it after construction.
// It starts at 0 (so the value check is a no-op) instead of being left
// indeterminate.
template <class K, class V, class S>
struct ExportIfPredFunctorV3 {
  K pattern;
  S threshold;
  int dim;
  ExportIfPredFunctorV3(K pattern, S threshold)
      : pattern(pattern), threshold(threshold), dim(0) {}
  template <int GroupSize>
  __forceinline__ __device__ bool operator()(
      const K& key, const V* value, const S& score,
      cg::thread_block_tile<GroupSize>& g) {
    /* evaluate key, score and value. */
    bool pred = score < threshold;

    // Visit the lanes of the tile one by one; lane i's value pointer, key and
    // score verdict are broadcast to every thread of the tile.
    for (int i = 0; i < g.size(); i++) {
      auto cur_value = g.shfl(value, i);
      auto cur_key = g.shfl(key, i);
      bool cur_pred = g.shfl(pred, i);
      // Lane i already failed the score test: nothing to verify.
      if (cur_pred == false) continue;
      unsigned int vote = 0;
      /* evaluate one value cooperatively in one loop. */
      // NOTE(review): g.ballot is called inside a loop whose trip count
      // depends on thread_rank; this looks like it assumes dim is a multiple
      // of the tile size — confirm.
      for (int j = g.thread_rank(); j < dim; j += g.size()) {
        if (cur_value[j] != cur_key) cur_pred = false;
        // Non-zero as soon as any thread saw a mismatching element.
        vote = g.ballot(cur_pred == false);
        if (vote != 0) break;
      }
      // Only lane i owns the verdict for its own entry.
      if (g.thread_rank() == i && vote != 0) pred = false;
    }
    return pred;
  }
};

// Using for_each API to simulate export_batch_if_v2 API.
// for_each functor that appends every entry whose score is strictly below
// `threshold` to the output arrays, using `d_counter` as the running output
// size.
//
// Only `pattern` and `threshold` are constructor arguments; `dim`,
// `d_counter` and the output pointers are assigned by the caller afterwards.
// They start at 0 / nullptr instead of being left indeterminate.
// `out_scores` may stay null, in which case scores are not written.
template <class K, class V, class S>
struct ForEachExecutionFuncV4 {
  K pattern;
  S threshold;
  int dim;
  uint64_t* d_counter;
  K* out_keys;
  V* out_vals;
  S* out_scores;
  ForEachExecutionFuncV4(K pattern, S threshold)
      : pattern(pattern),
        threshold(threshold),
        dim(0),
        d_counter(nullptr),
        out_keys(nullptr),
        out_vals(nullptr),
        out_scores(nullptr) {}
  template <int GroupSize>
  __forceinline__ __device__ void operator()(
      const K& key, V* value, S* score, cg::thread_block_tile<GroupSize>& g) {
    S score_val = *score;
    bool match = score_val < threshold;
    // Bit r of `vote` is set when lane r matched.
    uint32_t vote = g.ballot(match);
    int group_cnt = __popc(vote);
    uint64_t group_offset = 0;
    // Lane 0 reserves one contiguous output range for the whole tile and
    // broadcasts its start.
    if (g.thread_rank() == 0) {
      group_offset = atomicAdd(d_counter, static_cast<uint64_t>(group_cnt));
    }
    group_offset = g.shfl(group_offset, 0);
    // Number of matching lanes with a lower rank than this one, i.e. this
    // lane's slot inside the reserved range.
    int previous_cnt = group_cnt - __popc(vote >> g.thread_rank());
    if (match) {
      out_keys[group_offset + previous_cnt] = key;
      if (out_scores) {
        out_scores[group_offset + previous_cnt] = score_val;
      }
    }
    // Copy the value vectors: for each matching lane r, all threads of the
    // tile share the work of copying that lane's `dim` elements.
    for (int r = 0; r < GroupSize; r++) {
      uint32_t biased_vote = vote >> r;
      bool cur_match = biased_vote & 1;
      if (cur_match) {
        // Same slot computation as previous_cnt, but for lane r.
        int bias = group_cnt - __popc(biased_vote);
        V* cur_vals = g.shfl(value, r);
        for (int j = g.thread_rank(); j < dim; j += GroupSize) {
          out_vals[(group_offset + bias) * dim + j] = cur_vals[j];
        }
      }
    }
  }
};

// Selects the export path a test instantiation exercises: V1 uses
// export_batch_if, V2 and V3 use export_batch_if_v2 with different predicate
// functors, and V4 uses for_each.
enum class ExportIfVersion { V1, V2, V3, V4 };

// Fills a table from three key ranges with scores 15, 30 and 45, counts the
// entries with score < threshold via size_if, exports them through the path
// selected by EV, and checks that
//   * both counts agree,
//   * every exported score is below the threshold,
//   * every exported value element equals its key, and
//   * the exported keys are unique.
template <ExportIfVersion EV>
void test_export_batch_if_with_limited_size() {
  constexpr uint64_t CAP = 1llu << 24;
  size_t n0 = (1llu << 23) - 163;
  size_t n1 = (1llu << 23) + 221;
  size_t n2 = (1llu << 23) - 17;
  size_t dim = 64;
  size_t table_size = 0;
  i64 pattern = 0;
  u64 threshold = 40;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));
  // Destroys the stream on every exit path.  It is declared before the table
  // and the buffers so that it is destroyed after them.
  struct StreamGuard {
    cudaStream_t s;
    ~StreamGuard() { cudaStreamDestroy(s); }
  } stream_guard{stream};

  TableOptions options;
  options.init_capacity = CAP;
  options.max_capacity = CAP;
  options.dim = dim;
  options.max_hbm_for_vectors = nv::merlin::GB(100);

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  // Device counter shared by insert_and_evict, size_if and the export call.
  size_t* d_cnt = nullptr;
  CUDA_CHECK(cudaMallocAsync(&d_cnt, sizeof(size_t), stream));
  CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream));

  test_util::KVMSBuffer<i64, f32, u64> buffer0;
  buffer0.Reserve(n0, dim, stream);
  buffer0.ToRange(0, 1, stream);
  buffer0.Setscore((u64)15, stream);
  {
    test_util::KVMSBuffer<i64, f32, u64> buffer0_ev;
    buffer0_ev.Reserve(n0, dim, stream);
    buffer0_ev.ToZeros(stream);
    // table->insert_or_assign(n0, buffer0.keys_ptr(), buffer0.values_ptr(),
    //                         buffer0.scores_ptr(), stream, true, false);
    table->insert_and_evict(n0, buffer0.keys_ptr(), buffer0.values_ptr(),
                            buffer0.scores_ptr(), buffer0_ev.keys_ptr(),
                            buffer0_ev.values_ptr(), buffer0_ev.scores_ptr(),
                            d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    MERLIN_EXPECT_TRUE(table_size == n0, "Invalid table size.");
  }

  test_util::KVMSBuffer<i64, f32, u64> buffer1;
  buffer1.Reserve(n1, dim, stream);
  buffer1.ToRange(n0, 1, stream);
  buffer1.Setscore((u64)30, stream);
  {
    test_util::KVMSBuffer<i64, f32, u64> buffer1_ev;
    buffer1_ev.Reserve(n0, dim, stream);
    buffer1_ev.ToZeros(stream);
    // table->insert_or_assign(n1, buffer1.keys_ptr(), buffer1.values_ptr(),
    //                         buffer1.scores_ptr(), stream, true, false);
    // NOTE(review): only the first n0 of buffer1's n1 entries are inserted,
    // matching the n0-sized eviction buffer — confirm this is intended.
    table->insert_and_evict(n0, buffer1.keys_ptr(), buffer1.values_ptr(),
                            buffer1.scores_ptr(), buffer1_ev.keys_ptr(),
                            buffer1_ev.values_ptr(), buffer1_ev.scores_ptr(),
                            d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
  }

  test_util::KVMSBuffer<i64, f32, u64> buffer2;
  buffer2.Reserve(n2, dim, stream);
  buffer2.ToRange(n0 + n1, 1, stream);
  buffer2.Setscore((u64)45, stream);
  {
    test_util::KVMSBuffer<i64, f32, u64> buffer2_ev;
    buffer2_ev.Reserve(n0, dim, stream);
    buffer2_ev.ToZeros(stream);
    // table->insert_or_assign(n2, buffer2.keys_ptr(), buffer2.values_ptr(),
    //                         buffer2.scores_ptr(), stream, true, false);
    // NOTE(review): as above, n0 (not n2) entries are inserted — confirm.
    table->insert_and_evict(n0, buffer2.keys_ptr(), buffer2.values_ptr(),
                            buffer2.scores_ptr(), buffer2_ev.keys_ptr(),
                            buffer2_ev.values_ptr(), buffer2_ev.scores_ptr(),
                            d_cnt, stream, true, false);
    table_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    printf("final size: %zu, capacity: %zu\n", table_size, table->capacity());
  }

  size_t h_cnt = 0;   // Match count reported by size_if.
  size_t h_cnt2 = 0;  // Match count reported by the export path.

  table->size_if<ExportIfPredFunctor>(pattern, threshold, d_cnt, stream);
  CUDA_CHECK(cudaMemcpyAsync(&h_cnt, d_cnt, sizeof(size_t),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("---> check h_cnt from size_if kernel: %zu\n", h_cnt);

  // The output buffer holds exactly the number of matches size_if reported.
  test_util::KVMSBuffer<i64, f32, u64> buffer_out;
  buffer_out.Reserve(h_cnt, dim, stream);
  buffer_out.ToZeros(stream);

  // The export paths accumulate into d_cnt, so it has to start from zero.
  CUDA_CHECK(cudaMemsetAsync(d_cnt, 0, sizeof(size_t), stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  bool use_pin = false;

  uint64_t t0 = test_util::getTimestamp();
  cudaEvent_t start, stop;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&stop));
  // Record on the stream that carries the export so the events bracket it.
  CUDA_CHECK(cudaEventRecord(start, stream));
  if (EV == ExportIfVersion::V1) {
    table->export_batch_if<ExportIfPredFunctor>(
        pattern, threshold, static_cast<size_t>(CAP), 0, d_cnt,
        buffer_out.keys_ptr(!use_pin), buffer_out.values_ptr(!use_pin),
        buffer_out.scores_ptr(!use_pin), stream);
  } else if (EV == ExportIfVersion::V2) {
    ExportIfPredFunctorV2<i64, f32, u64> pred(pattern, threshold);
    table->export_batch_if_v2<ExportIfPredFunctorV2<i64, f32, u64>>(
        pred, static_cast<size_t>(CAP), 0, d_cnt, buffer_out.keys_ptr(!use_pin),
        buffer_out.values_ptr(!use_pin), buffer_out.scores_ptr(!use_pin),
        stream);
  } else if (EV == ExportIfVersion::V3) {
    ExportIfPredFunctorV3<i64, f32, u64> pred(pattern, threshold);
    pred.dim = dim;
    table->export_batch_if_v2<ExportIfPredFunctorV3<i64, f32, u64>>(
        pred, static_cast<size_t>(CAP), 0, d_cnt, buffer_out.keys_ptr(!use_pin),
        buffer_out.values_ptr(!use_pin), buffer_out.scores_ptr(!use_pin),
        stream);
  } else if (EV == ExportIfVersion::V4) {
    ForEachExecutionFuncV4<i64, f32, u64> f(pattern, threshold);
    f.dim = dim;
    f.d_counter = d_cnt;
    f.out_keys = buffer_out.keys_ptr(!use_pin);
    f.out_vals = buffer_out.values_ptr(!use_pin);
    f.out_scores = buffer_out.scores_ptr(!use_pin);
    table->for_each<ForEachExecutionFuncV4<i64, f32, u64>>(
        0, static_cast<size_t>(CAP), f, stream);
  }
  CUDA_CHECK(cudaEventRecord(stop, stream));
  CUDA_CHECK(cudaMemcpyAsync(&h_cnt2, d_cnt, sizeof(size_t),
                             cudaMemcpyDeviceToHost, stream));
  // After this sync both events have completed and h_cnt2 is valid.
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("final h_cnt2: %zu\n", h_cnt2);

  MERLIN_EXPECT_TRUE(
      h_cnt == h_cnt2,
      "size_if and export_batch_if get different matching count.");
  float cu_cost = 0;
  CUDA_CHECK(cudaEventElapsedTime(&cu_cost, start, stop));
  CUDA_CHECK(cudaEventDestroy(start));
  CUDA_CHECK(cudaEventDestroy(stop));
  uint64_t t1 = test_util::getTimestamp();
  printf("final h_cnt2: %zu, cost: %zu, cu_cost: %f\n", h_cnt2,
         static_cast<size_t>(t1 - t0), cu_cost);

  if (!use_pin) {
    buffer_out.SyncData(false, stream);
  }
  CUDA_CHECK(cudaStreamSynchronize(stream));
  uint64_t t2 = test_util::getTimestamp();
  printf("use_pin: %d. After sycn data of len: %zu, total cost: %zu\n", use_pin,
         h_cnt2, static_cast<size_t>(t2 - t0));

  // Validate the exported entries on the host.
  std::unordered_map<i64, u64> record;
  for (size_t i = 0; i < h_cnt; i++) {
    i64 key = buffer_out.keys_ptr(false)[i];
    u64 score = buffer_out.scores_ptr(false)[i];
    MERLIN_EXPECT_TRUE(score < threshold, "");
    record[key] = score;
    for (size_t j = 0; j < dim; j++) {
      f32 value = buffer_out.values_ptr(false)[i * dim + j];
      MERLIN_EXPECT_TRUE(key == static_cast<i64>(value), "");
    }
  }
  // A smaller map than h_cnt2 would mean a key was exported more than once.
  MERLIN_EXPECT_TRUE(record.size() == h_cnt2, "");
  printf("record: %zu\n", record.size());
  printf("n0+n1: %zu\n", n0 + n1);
  printf("n0+n1+n2: %zu\n", n0 + n1 + n2);

  // Release the stream-ordered counter allocation before the stream goes away.
  CUDA_CHECK(cudaFreeAsync(d_cnt, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  printf("done\n");
}

// Entry point of the export test binary: runs
// test_export_batch_if_with_limited_size once for each ExportIfVersion
// variant (V1, V2, V3, V4), in that order, and then returns 0.
int main() {
  test_export_batch_if_with_limited_size<ExportIfVersion::V1>();
  test_export_batch_if_with_limited_size<ExportIfVersion::V2>();
  test_export_batch_if_with_limited_size<ExportIfVersion::V3>();
  test_export_batch_if_with_limited_size<ExportIfVersion::V4>();
  return 0;
}


================================================
FILE: tests/find_or_insert_ptr_lock_test.cc.cu
================================================
/*
 * Copyright (c) 2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * test APIs: find_or_insert_ptr, unlock_keys
 */

#include <gtest/gtest.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <thread>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

constexpr size_t DIM = 16;
using K = uint64_t;
using V = float;
using S = uint64_t;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

/*
 * Device-side predicate handed to `erase_if` by test_erase_if_pred.
 *
 * Returns true when BOTH conditions hold:
 *   - the low 7 bits of `key` (key & 0x7f) are greater than `pattern`, and
 *   - `score` is greater than `threshold`.
 *
 * key       : key of the entry being examined.
 * score     : score of the entry being examined (not modified here).
 * pattern   : value the masked key is compared against.
 * threshold : value the score is compared against.
 */
template <class K, class S>
struct EraseIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    // The mask must be parenthesized: `>` binds tighter than `&`, so the
    // unparenthesized form evaluates `key & (0x7f > pattern)`, i.e. it only
    // tests the lowest bit of `key` against a 0/1 comparison result.
    return (((key & 0x7f) > pattern) && (score > threshold));
  }
};

/*
 * Device-side predicate that accepts an entry purely on its score.
 *
 * Returns true when `score` is strictly greater than `threshold`; `key` and
 * `pattern` are accepted to satisfy the predicate signature but are not read.
 */
template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    const bool score_exceeds_threshold = (score > threshold);
    return score_exceeds_threshold;
  }
};

/*
 * Test helper that drives the pointer-returning find_or_insert followed by
 * unlock_keys for a batch of keys.
 *
 * Steps, all issued on `stream`:
 *   1. allocate per-key scratch buffers (found flags, value pointers, key
 *      pointers) on the device;
 *   2. call table->find_or_insert to obtain a value pointer per key;
 *   3. call test_util::read_or_write_ptr with those pointers, `d_vectors` and
 *      the found flags (presumably writes new entries and reads existing
 *      ones -- confirm against test_util);
 *   4. call table->unlock_keys with the key pointers returned in step 2;
 *   5. synchronize the stream and free the scratch buffers.
 *
 * table     : hash table under test.
 * KEY_NUM   : number of keys in the batch.
 * d_keys    : device array of KEY_NUM keys.
 * d_scores  : device array of KEY_NUM scores forwarded to find_or_insert.
 * d_vectors : device array of KEY_NUM * dim values.
 * dim       : number of V elements per key.
 * stream    : CUDA stream used for every call; synchronized before return.
 */
template <typename T>
void find_or_insert_safe_ptr(T* table, uint64_t KEY_NUM, K* d_keys, S* d_scores,
                             V* d_vectors, uint64_t dim, cudaStream_t& stream) {
  // Scratch buffers holding one entry per key; released before returning.
  K** key_slots = nullptr;
  V** value_slots = nullptr;
  bool* found_flags;
  CUDA_CHECK(cudaMalloc(&found_flags, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&value_slots, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&key_slots, KEY_NUM * sizeof(K*)));

  // The trailing `true, false, key_slots` arguments are forwarded
  // positionally; their meaning is defined by the table API.
  table->find_or_insert(KEY_NUM, d_keys, value_slots, found_flags, d_scores,
                        stream, true, false, key_slots);
  test_util::read_or_write_ptr(value_slots, d_vectors, found_flags, dim,
                               KEY_NUM, stream);
  /// TODO: check the found flags
  table->unlock_keys(KEY_NUM, key_slots, d_keys, found_flags, stream);

  // Everything above was enqueued on `stream`; wait before freeing.
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaFree(key_slots));
  CUDA_CHECK(cudaFree(value_slots));
  CUDA_CHECK(cudaFree(found_flags));
}

// End-to-end check of the basic table operations on a table whose capacity
// (64M slots, init == max) is much larger than the 1M keys inserted.
//
// Sequence: insert via find_or_insert_safe_ptr -> find -> assign -> find ->
// accum_or_assign -> erase half -> clear -> insert again -> find ->
// export_batch, asserting the table size and/or the returned contents after
// each step.
//
// max_hbm_for_vectors : passed through nv::merlin::GB() into
//                       options.max_hbm_for_vectors.
// key_start           : stored in options.reserved_key_start_bit.
void test_basic(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  // Host-side staging buffers (allocated with cudaMallocHost below).
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  // NOTE(review): h_vectors comes from cudaMallocHost, so this cudaMemset
  // targets a host allocation -- confirm this is intended.
  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  // Fill the host buffers. The assertions further down expect
  // score == key and every vector element == key * 0.00001.
  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  // Device-side mirrors of the host buffers, plus a second value buffer used
  // by the assign/find round trip.
  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_new_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A fresh table per iteration; it must start empty.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Step 1: insert all keys; every key is expected to be stored.
    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Zero the output buffers so the following find must repopulate them.
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    // Step 2: pointer-returning find, then gather the values into d_vectors.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    int found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    // Every key must be found with score == key and value == key * 0.00001.
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    // Step 3: assign new values. cudaMemset is byte-wise, so filling with the
    // byte 2 gives every float the bit pattern 0x02020202, which is what the
    // check below compares against (i_value).
    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));
    table->assign(KEY_NUM, reinterpret_cast<const K*>(d_keys),
                  reinterpret_cast<const float*>(d_new_vectors),
                  reinterpret_cast<const S*>(d_scores), stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // assign must not change the number of stored keys.
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    // Step 4: read the assigned values back.
    // NOTE(review): find is called with d_new_vectors (a V*) rather than
    // d_vectors_ptr, while d_vectors_ptr is built by array2ptr from that same
    // d_new_vectors buffer and then used as the source of read_from_ptr into
    // d_new_vectors -- confirm this mix of the two forms is intended.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      test_util::array2ptr(d_vectors_ptr, d_new_vectors, options.dim, KEY_NUM,
                           stream);
      table->find(KEY_NUM, d_keys, d_new_vectors, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_new_vectors, options.dim,
                               KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    // Bit pattern produced by the byte-wise memset with value 2 above.
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  *(reinterpret_cast<float*>(&i_value)));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    // Step 5: accum_or_assign on the same keys; only the size is checked.
    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,
                           stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Step 6: erase the first half of the keys; the size must halve.
    table->erase(KEY_NUM >> 1, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, total_size >> 1);

    // Step 7: clear must leave the table empty.
    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Step 8: insert again into the cleared table.
    // d_scores currently holds what the Step 2 find wrote back; d_vectors was
    // last passed to accum_or_assign in Step 5.
    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    // Step 9: find again and verify scores and values as in Step 2.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    // Step 10: export the whole table. The device buffers are zeroed first
    // and then overwritten by export_batch, so from here on h_keys holds the
    // exported keys (in export order) rather than the generated ones.
    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ASSERT_EQ(dump_counter, KEY_NUM);
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    // Each exported triple must be self-consistent.
    for (int i = 0; i < KEY_NUM; i++) {
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  // Release host and device buffers.
  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_new_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks insert/erase/re-insert behaviour when the number of keys equals the
// table capacity (1M keys into a 1M-slot table, init == max capacity).
//
// Because the table is exactly as large as the batch, the size after the
// first insert is not asserted against KEY_NUM. Instead the test records it,
// erases every key (size must become 0), inserts the same batch again and
// asserts that the resulting size equals the recorded one.
//
// max_hbm_for_vectors : passed through nv::merlin::GB() into
//                       options.max_hbm_for_vectors.
// key_start           : stored in options.reserved_key_start_bit.
void test_basic_when_full(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  // Host-side staging buffers (allocated with cudaMallocHost below).
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  // Only keys and scores are generated; no host vectors are requested
  // (nullptr is passed for the vector buffer).
  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));

  // Byte-wise fills: the actual float values are irrelevant to this test,
  // which only asserts table sizes.
  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A fresh table per iteration; it must start empty.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // First insert: record how many keys actually ended up stored.
    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_insert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Erasing the full batch must empty the table.
    table->erase(KEY_NUM, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, 0);

    // Re-inserting the same batch must reproduce the recorded size.
    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_reinsert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  // Release every host buffer, including h_vectors, which was previously
  // allocated with cudaMallocHost but never freed.
  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Exercises erase_if with EraseIfPredFunctor on a small table (capacity 256)
// holding 128 keys produced by test_util::create_keys_in_one_buckets.
//
// The assertions are conservation checks: the number of erased keys plus the
// number of remaining keys must equal BUCKET_MAX_SIZE, and a subsequent find
// must locate exactly the remaining keys with intact scores and values.
//
// max_hbm_for_vectors : passed through nv::merlin::GB() into
//                       options.max_hbm_for_vectors.
// key_start           : stored in options.reserved_key_start_bit.
void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;
  constexpr uint64_t BUCKET_MAX_SIZE = 128;

  // Host-side staging buffers (allocated with cudaMallocHost below).
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Unlike the sibling tests, the table is created once, outside the loop,
  // and emptied with clear() at the end of each iteration.
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // Generate the batch on the host and upload it. The checks below expect
    // score == key and every vector element == key * 0.00001.
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));

    // KEY_NUM == BUCKET_MAX_SIZE, so all inserted keys must be present.
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

    // Erase by predicate; erased + remaining must account for every key.
    K pattern = 100;
    S threshold = 0;
    size_t erase_num = table->template erase_if<EraseIfPredFunctor>(
        pattern, threshold, stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ((erase_num + total_size), BUCKET_MAX_SIZE);

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    // Look up the original batch; only surviving keys should be found.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    // NOTE(review): these cudaMemset calls target buffers obtained from
    // cudaMallocHost -- confirm this is intended.
    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // Scores and values are verified only for keys reported as found.
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        ASSERT_EQ(h_scores[i], h_keys[i]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, (BUCKET_MAX_SIZE - erase_num));

    // Leave the shared table empty for the next iteration.
    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  // Release host and device buffers.
  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Verifies that stored entries survive a capacity change.
//
// Repeated TEST_TIMES (100) times with a fresh table each iteration:
//   - start with capacity == BUCKET_MAX_SIZE (128), max capacity 4x that;
//   - insert 2 * BUCKET_MAX_SIZE keys and assert all of them are stored;
//   - export the table, then reserve(MAX_CAPACITY) and assert the capacity
//     and size;
//   - find the first BUCKET_MAX_SIZE exported keys and verify their scores
//     and values.
//
// max_hbm_for_vectors : passed through nv::merlin::GB() into
//                       options.max_hbm_for_vectors.
// key_start           : stored in options.reserved_key_start_bit.
void test_rehash(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;
  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;
  constexpr uint64_t TEST_TIMES = 100;
  // Host-side staging buffers (allocated with cudaMallocHost below).
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // Fresh table and freshly generated batch for every iteration.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Insert twice as many keys as the initial capacity; all must be kept.
    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaDeviceSynchronize());
    ASSERT_EQ(total_size, KEY_NUM);

    // Export everything; this overwrites d_keys / d_vectors / d_scores with
    // the table contents in export order.
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(dump_counter, KEY_NUM);

    // Grow to the maximum capacity; the size must be unchanged afterwards.
    table->reserve(MAX_CAPACITY, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table->capacity(), MAX_CAPACITY);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    // Only the first BUCKET_MAX_SIZE exported keys are looked up.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BUCKET_MAX_SIZE * sizeof(V*)));
      table->find(BUCKET_MAX_SIZE, d_keys, d_vectors_ptr, d_found, d_scores,
                  stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim,
                               BUCKET_MAX_SIZE, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    // NOTE(review): these cudaMemset calls target buffers obtained from
    // cudaMallocHost -- confirm this is intended.
    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    // h_keys is refreshed from d_keys because export_batch replaced the
    // device key order.
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // All looked-up keys must be found with score == key and
    // value == key * 0.00001.
    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {
      if (h_found[i]) {
        found_num++;
        ASSERT_EQ(h_scores[i], h_keys[i]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, BUCKET_MAX_SIZE);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  // Release host and device buffers.
  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Verifies automatic capacity growth when batches are inserted into a table
// configured with max_load_factor 0.6 (init capacity 1024, max 16 * 1024).
//
// Sequence:
//   - insert the first INIT_KEY_NUM (1024) keys; expects size 1024 and
//     capacity INIT_CAPACITY * 2;
//   - insert the whole batch of KEY_NUM (2048) keys, which includes the first
//     1024 again; expects size 2048 and capacity KEY_NUM * 4;
//   - export the table, find every exported key and verify scores and values.
//
// max_hbm_for_vectors : passed through nv::merlin::GB() into
//                       options.max_hbm_for_vectors.
// key_start           : NOTE(review): accepted but never read here; the
//                       sibling tests store it in
//                       options.reserved_key_start_bit -- confirm whether the
//                       omission is intentional.
void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 1024;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024;
  constexpr uint64_t INIT_KEY_NUM = 1024;
  constexpr uint64_t KEY_NUM = 2048;
  // Host-side staging buffers (allocated with cudaMallocHost below).
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = 128;
  options.max_load_factor = 0.6;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  uint64_t expected_size = 0;
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  // Generate the full batch once and upload it. The checks below expect
  // score == key and every vector element == key * 0.00001.
  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);

  // First batch: only the leading INIT_KEY_NUM keys.
  find_or_insert_safe_ptr(table.get(), INIT_KEY_NUM, d_keys, d_scores,
                          d_vectors, options.dim, stream);

  CUDA_CHECK(cudaStreamSynchronize(stream));
  expected_size = INIT_KEY_NUM;

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));

  // Second batch: all KEY_NUM keys (the first INIT_KEY_NUM are repeats).
  find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, d_scores, d_vectors,
                          options.dim, stream);

  CUDA_CHECK(cudaStreamSynchronize(stream));
  expected_size = KEY_NUM;

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), KEY_NUM * 4);

  // Export everything; this overwrites d_keys / d_vectors / d_scores with the
  // table contents in export order.
  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                     d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(dump_counter, expected_size);

  // Zero the outputs so the following find must repopulate them.
  CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
  {
    V** d_vectors_ptr = nullptr;
    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
    table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
    test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                             stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaFree(d_vectors_ptr));
  }
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int found_num = 0;

  // NOTE(review): these cudaMemset calls target buffers obtained from
  // cudaMallocHost -- confirm this is intended.
  CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
  // h_keys is refreshed from d_keys because export_batch replaced the device
  // key order.
  CUDA_CHECK(
      cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));
  // Every exported key must be found with score == key and
  // value == key * 0.00001.
  for (int i = 0; i < KEY_NUM; i++) {
    if (h_found[i]) {
      found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
  }
  ASSERT_EQ(found_num, KEY_NUM);

  table->clear(stream);
  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  // Release host and device buffers.
  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = 4 * 1024;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 256;
  constexpr uint64_t THREAD_N = 8;

  std::vector<std::thread> threads;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);

  auto worker_function = [&table, KEY_NUM, options](int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    while (table->capacity() < MAX_CAPACITY) {
      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                  KEY_NUM);
      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

      find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, nullptr, d_vectors,
                              options.dim, stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      {
        V** d_vectors_ptr = nullptr;
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
        table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
        test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                                 stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));
      int found_num = 0;

      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));

      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
      ASSERT_EQ(found_num, KEY_NUM);
      if (task_n == 0 && current_capacity != table->capacity()) {
        std::cout << "[test_dynamic_rehash_on_multi_threads] The capacity "
                     "changed from "
                  << current_capacity << " to " << table->capacity()
                  << std::endl;
        current_capacity = table->capacity();
      }
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }
    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  for (int i = 0; i < THREAD_N; ++i)
    threads.emplace_back(std::thread(worker_function, i));

  for (auto& th : threads) {
    th.join();
  }
  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
}

void test_export_batch_if(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  size_t h_dump_counter = 0;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  size_t* d_dump_counter;
  int found_num = 0;
  bool* h_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));

  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  S threshold = test_util::host_nano<S>();
  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                                KEY_NUM);

    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, nullptr, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    K pattern = 100;

    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,
        d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));

    size_t expected_export_count = 0;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_scores[i] > threshold) expected_export_count++;
    }
    ASSERT_EQ(expected_export_count, h_dump_counter);

    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, test_util::host_nano<S>(), table->capacity(), 0,
        d_dump_counter, d_keys, d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));

    ASSERT_EQ(0, h_dump_counter);

    CUDA_CHECK(cudaMemset(h_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < h_dump_counter; i++) {
      ASSERT_GT(h_scores[i], threshold);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_dump_counter));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Basic LRU eviction test, repeated TEST_TIMES times on a fresh table:
//   1. insert BASE_KEY_NUM keys through find_or_insert and check that the
//      table holds BUCKET_MAX_SIZE entries whose exported scores all lie in
//      the [start_ts, end_ts] window sampled around the insert;
//   2. assign + find_or_insert TEST_KEY_NUM keys (two of them copied from the
//      base set) and check that the table still holds BUCKET_MAX_SIZE
//      entries, that exported keys belonging to the test set have scores in
//      the new [start_ts, end_ts] window, and that all other exported keys
//      have scores not newer than start_ts.
// In both phases every exported vector must match the value derived from its
// key.
//
// NOTE(review): `create_keys_in_one_buckets` presumably generates keys that
// all hash into a single bucket — confirm against test_util.
//
// max_hbm_for_vectors: forwarded (via nv::merlin::GB) to
//                      options.max_hbm_for_vectors.
// key_start:           forwarded to options.reserved_key_start_bit.
void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make the last two test keys (and their vectors) duplicates of base keys,
  // so the test batch mixes keys already in the table with new ones.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S start_ts;
  S end_ts;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        K** d_key_ptrs = nullptr;
        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));
        CUDA_CHECK(cudaMalloc(&d_key_ptrs, BASE_KEY_NUM * sizeof(K*)));
        // Timestamps bracket the insert; exported scores are checked against
        // them below.
        start_ts = test_util::host_nano<S>(stream);
        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              nullptr, stream, true, false, d_key_ptrs);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BASE_KEY_NUM, stream);
        table->unlock_keys(BASE_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,
                           stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        end_ts = test_util::host_nano<S>(stream);
        CUDA_CHECK(cudaFree(d_key_ptrs));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Smallest and largest exported score must fall inside the window.
      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);
      for (size_t k = 0; k < dump_counter; k++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[k * options.dim + j],
                    static_cast<float>(h_keys_temp[k] * 0.00001));
        }
      }
    }

    // Phase 2: touch the test keys.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      start_ts = test_util::host_nano<S>(stream);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);

      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        K** d_key_ptrs = nullptr;
        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));
        CUDA_CHECK(cudaMalloc(&d_key_ptrs, TEST_KEY_NUM * sizeof(K*)));
        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              nullptr, stream, true, false, d_key_ptrs);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, TEST_KEY_NUM, stream);
        table->unlock_keys(TEST_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,
                           stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        end_ts = test_util::host_nano<S>(stream);
        CUDA_CHECK(cudaFree(d_key_ptrs));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Collect the scores of exported keys that belong to the test set;
      // every other exported key must not be newer than start_ts.
      std::vector<S> h_scores_temp_sorted(TEST_KEY_NUM);
      size_t ctr = 0;
      for (uint64_t k = 0; k < TEMP_KEY_NUM; k++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[k])) {
          ASSERT_GT(h_scores_temp[k], BUCKET_MAX_SIZE);
          // Never write past the TEST_KEY_NUM slots reserved above.
          ASSERT_LT(ctr, TEST_KEY_NUM);
          h_scores_temp_sorted[ctr++] = h_scores_temp[k];
        } else {
          ASSERT_LE(h_scores_temp[k], start_ts);
        }
      }
      // At least one test key must have been exported; otherwise the
      // `[ctr - 1]` access below would read out of bounds.
      ASSERT_GT(ctr, 0UL);
      std::sort(h_scores_temp_sorted.begin(),
                h_scores_temp_sorted.begin() + ctr);

      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);
      for (size_t k = 0; k < dump_counter; k++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[k * options.dim + j],
                    static_cast<float>(h_keys_temp[k] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Basic LFU eviction test, repeated TEST_TIMES times on a fresh table:
//   1. insert BASE_KEY_NUM keys with explicit scores and check that every
//      exported score equals `key % freq_range`;
//   2. assign + find_or_insert TEST_KEY_NUM keys (two of them copied from the
//      base set, with matching scores) and check that the table still holds
//      BUCKET_MAX_SIZE entries, that keys present in both sets carry three
//      times `key % freq_range`, and that every other exported key carries
//      `key % freq_range`.
// In both phases every exported vector must match the value derived from its
// key.
//
// NOTE(review): the factor 3 presumably comes from the base insert, the
// assign and the find_or_insert each adding the supplied score — confirm
// against the LFU implementation.
//
// max_hbm_for_vectors: forwarded (via nv::merlin::GB) to
//                      options.max_hbm_for_vectors.
// key_start:           forwarded to options.reserved_key_start_bit.
void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Make the last two test entries (key, score and vector) duplicates of base
  // entries, so the test batch mixes keys already in the table with new ones.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys with their scores.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));

      find_or_insert_safe_ptr(table.get(), BASE_KEY_NUM, d_keys_temp,
                              d_scores_temp, d_vectors_temp, options.dim,
                              stream);

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t k = 0; k < dump_counter; k++) {
        ASSERT_EQ(h_scores_temp[k], h_keys_temp[k] % freq_range);
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[k * options.dim + j],
                    static_cast<float>(h_keys_temp[k] * 0.00001));
        }
      }
    }

    // Phase 2: assign and then find_or_insert the test keys.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      find_or_insert_safe_ptr(table.get(), TEST_KEY_NUM, d_keys_temp,
                              d_scores_temp, d_vectors_temp, options.dim,
                              stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t k = 0; k < dump_counter; k++) {
        bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[k]);
        bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[k]);
        if (in_base && in_test) {
          ASSERT_EQ(h_scores_temp[k], (h_keys_temp[k] % freq_range) *
                                          3);  // update score when found.
        } else {
          ASSERT_EQ(h_scores_temp[k], (h_keys_temp[k] % freq_range));
        }
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[k * options.dim + j],
                    static_cast<float>(h_keys_temp[k] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr int RSHIFT_ON_NANO = 20;

  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S start_ts;
  S end_ts;

  S global_epoch = 1;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        K** d_key_ptrs = nullptr;
        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));
        CUDA_CHECK(cudaMalloc(&d_key_ptrs, BASE_KEY_NUM * sizeof(K*)));

        start_ts =
            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
        table->set_global_epoch(global_epoch);
        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              nullptr, stream, true, false, d_key_ptrs);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BASE_KEY_NUM, stream);
        table->unlock_keys(BASE_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,
                           stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        end_ts =
            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
        CUDA_CHECK(cudaFree(d_key_ptrs));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],
                (global_epoch << 32 | end_ts));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
      table->set_global_epoch(global_epoch);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        K** d_key_ptrs = nullptr;
        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));
        CUDA_CHECK(cudaMalloc(&d_key_ptrs, TEST_KEY_NUM * sizeof(K*)));

        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              nullptr, stream, true, false, d_key_ptrs);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, TEST_KEY_NUM, stream);
        table->unlock_keys(TEST_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,
                           stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));

        end_ts =
            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
        CUDA_CHECK(cudaFree(d_key_ptrs));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(TEST_KEY_NUM);
      int ctr = 0;
      for (int i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {
          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));
          h_scores_temp_sorted[ctr++] = h_scores_temp[i];
        } else {
          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));
        }
      }
      std::sort(h_scores_temp_sorted.begin(),
                h_scores_temp_sorted.begin() + ctr);

      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));
      ASSERT_LE(h_scores_temp_sorted[ctr - 1], (global_epoch << 32 | end_ts));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Simulate overflow of low 32bits.
  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -
                                     static_cast<uint32_t>(1));

  h_keys_test[1] = h_keys_base[71];
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[1] = h_scores_base[71];
  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[1 * options.dim + i] = h_vectors_base[71 * options.dim + i];
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S global_epoch = 1;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));

      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        K** d_key_ptrs = nullptr;
        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));
        CUDA_CHECK(cudaMalloc(&d_key_ptrs, BASE_KEY_NUM * sizeof(K*)));

        table->set_global_epoch(global_epoch);
        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream, true, false, d_key_ptrs);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BASE_KEY_NUM, stream);
        table->unlock_keys(BASE_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,
                           stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_key_ptrs));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        if (h_keys_temp[i] == h_keys_base[71]) {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, h_scores_base[71]);
          ASSERT_EQ(h_scores_temp[i], expected_score);
        } else {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, (h_keys_temp[i] % freq_range));
          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        K** d_key_ptrs = nullptr;
        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));
        CUDA_CHECK(cudaMalloc(&d_key_ptrs, TEST_KEY_NUM * sizeof(K*)));
        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream, true, false, d_key_ptrs);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, TEST_KEY_NUM, stream);
        table->unlock_keys(TEST_KEY_NUM, d_key_ptrs, d_keys_temp, d_found,
                           stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_key_ptrs));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),
                                                 h_keys_temp.end(),
                                                 h_keys_base[71]));

      for (int i = 0; i < dump_counter; i++) {
        bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);
        bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);

        if (in_base && in_test) {
          if (h_keys_temp[i] == h_keys_base[71]) {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch, h_scores_base[71] * 2);
            ASSERT_EQ(h_scores_temp[i], expected_score);
          } else {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch,
                (h_keys_temp[i] % freq_range) * 3);  // update score when found.
            ASSERT_EQ(h_scores_temp[i], expected_score);
          }
        } else {
          if (h_keys_temp[i] == h_keys_base[71]) {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch - static_cast<S>(in_base), h_scores_base[71]);
            ASSERT_EQ(h_scores_temp[i], expected_score);
          } else {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch - static_cast<S>(in_base),
                (h_keys_temp[i] % freq_range));

            ASSERT_EQ(h_scores_temp[i], expected_score);
          }
        }
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 128;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  const S base_score_start = 1000;
  for (int i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);
  const S test_score_start = base_score_start + BASE_KEY_NUM;
  for (int i = 0; i < TEST_KEY_NUM; i++) {
    h_scores_test[i] = test_score_start + i;
  }
  for (int i = 64; i < TEST_KEY_NUM; i++) {
    h_keys_test[i] = h_keys_base[i];
    for (int j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      find_or_insert_safe_ptr(table.get(), BASE_KEY_NUM, d_keys_temp,
                              d_scores_temp, d_vectors_temp, options.dim,
                              stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      find_or_insert_safe_ptr(table.get(), TEST_KEY_NUM, d_keys_temp,
                              d_scores_temp, d_vectors_temp, options.dim,
                              stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range_test =
          test_util::range<S, TEST_KEY_NUM>(test_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range_test.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,
                                             int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 8;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 256;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  const S base_score_start = 1000;
  for (int i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  h_keys_test[4] = h_keys_base[72];
  h_keys_test[5] = h_keys_base[73];
  h_keys_test[6] = h_keys_base[74];
  h_keys_test[7] = h_keys_base[75];

  // replace four new keys to lower scores, would not be inserted.
  h_scores_test[0] = 20;
  h_scores_test[1] = 78;
  h_scores_test[2] = 97;
  h_scores_test[3] = 98;

  // replace three exist keys to new scores, just refresh the score for them.
  h_scores_test[4] = 99;
  h_scores_test[5] = 1010;
  h_scores_test[6] = 1020;
  h_scores_test[7] = 1035;

  for (int i = 4; i < TEST_KEY_NUM; i++) {
    for (int j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] =
          static_cast<V>(h_keys_test[i] * 0.00001);
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      find_or_insert_safe_ptr(table.get(), BASE_KEY_NUM, d_keys_temp,
                              d_scores_temp, d_vectors_temp, options.dim,
                              stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      find_or_insert_safe_ptr(table.get(), TEST_KEY_NUM, d_keys_temp,
                              d_scores_temp, d_vectors_temp, options.dim,
                              stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < TEST_KEY_NUM; i++) {
        if (i < 4) {
          ASSERT_EQ(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        } else {
          ASSERT_NE(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        }
      }
      for (int i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_temp[i] == h_keys_test[4])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);
        if (h_keys_temp[i] == h_keys_test[5])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);
        if (h_keys_temp[i] == h_keys_test[6])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);
        if (h_keys_temp[i] == h_keys_test[7])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[7]);

        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors,
                                                 int key_start = 0) {
  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
  constexpr uint64_t STEPS = 128;
  constexpr uint64_t MAX_BUCKET_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t TEST_TIMES = 1;
  float expected_correct_rate = 0.964;
  const int rounds = 12;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = MAX_BUCKET_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();
  S* h_scores_base = test_util::HostBuffer<S>(BATCH_SIZE).ptr();
  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();

  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();
  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();
  V* h_vectors_temp =
      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t global_start_key = 100000;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    size_t start_key = global_start_key;

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    for (int r = 0; r < rounds; r++) {
      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;
      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;
      size_t expected_table_size =
          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)
                   : INIT_CAPACITY;

      for (int s = 0; s < STEPS; s++) {
        test_util::create_continuous_keys<K, S, V, DIM>(
            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);
        start_key += BATCH_SIZE;

        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),
                              cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,
                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,
                              BATCH_SIZE * sizeof(V) * options.dim,
                              cudaMemcpyHostToDevice));
        table->assign(BATCH_SIZE, d_keys_temp, d_vectors_temp, d_scores_temp,
                      stream);
        find_or_insert_safe_ptr(table.get(), BATCH_SIZE, d_keys_temp,
                                d_scores_temp, d_vectors_temp, options.dim,
                                stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_GE(total_size, expected_table_size);
      ASSERT_EQ(MAX_CAPACITY, table->capacity());

      size_t dump_counter = table->export_batch(
          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,
                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,
                            MAX_CAPACITY * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));

      size_t bigger_score_counter = 0;
      K max_key = 0;
      size_t values_error_counter = 0;
      for (int i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);
        max_key = std::max(max_key, h_keys_temp[i]);
        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;
        for (int j = 0; j < options.dim; j++) {
          if (h_vectors_temp[i * options.dim + j] !=
              static_cast<float>(h_keys_temp[i] * 0.00001)) {
            values_error_counter++;
          }
        }
      }

      ASSERT_EQ(values_error_counter, 0);
      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;
      std::cout << std::setprecision(3) << "[Round " << r << "]"
                << "correct_rate=" << correct_rate << std::endl;
      ASSERT_GE(max_key, expected_max_key);
      ASSERT_GE(correct_rate, expected_correct_rate);
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/*
 * Exercises find_or_insert_safe_ptr / assign / find on one shared LRU table
 * from THREAD_N host threads, split into three batches:
 *   - batch 0: threads run concurrently; results are checked with ASSERT_*,
 *   - batch 1: threads run one after another; results are checked with
 *              ASSERT_*,
 *   - batch 2: threads run concurrently; mismatches are only counted and
 *              printed (eviction may occur in this phase).
 * Within each batch the threads alternate between `worker1` and `worker2`.
 *
 * max_hbm_for_vectors: forwarded through nv::merlin::GB() into
 *                      options.max_hbm_for_vectors.
 * BATCH_0_RATIO:       fraction of THREAD_N used for batch 0.
 * BATCH_1_RATIO:       fraction of THREAD_N used for batch 1; the remaining
 *                      threads form batch 2.
 * capacity_silent:     when false, a thread prints a message if the table
 *                      capacity changed while it was running.
 */
void test_find_or_insert_multi_threads(size_t max_hbm_for_vectors,
                                       const float BATCH_0_RATIO,
                                       const float BATCH_1_RATIO,
                                       bool capacity_silent = true) {
  constexpr uint64_t THREAD_N = 64UL;
  const uint64_t BATCH_0_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_0_RATIO);
  const uint64_t BATCH_1_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_1_RATIO);
  const uint64_t BATCH_2_SIZE = THREAD_N - BATCH_0_SIZE - BATCH_1_SIZE;

  constexpr uint64_t INIT_CAPACITY = 32 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = 128 * 1024 * 1024UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;

  std::vector<std::thread> threads;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  // The table is shared by every worker thread (captured by reference).
  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);
  // worker1: assign on not-yet-inserted keys must not insert them; after
  // find_or_insert_safe_ptr every found value must equal `key * 0.00001`.
  // The checks assume every key is different.
  auto worker1 = [&table, KEY_NUM, options, capacity_silent](int batch,
                                                             int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    // Remembered so a capacity change during this worker can be reported.
    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    // Each worker uses its own stream.
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    // Step 1: assign before any insert, then look the keys up.
    table->assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      // NOTE(review): read_from_ptr is defined outside this view; presumably
      // it copies the rows addressed by d_vectors_ptr into d_vectors.
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    {
      // None of the keys may be reported as found at this point.
      int found_num = 0;
      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
        }
      }
      ASSERT_EQ(found_num, 0);
    }
    // Step 2: insert the keys, then look them up again.
    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, nullptr, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Clear the device vectors so the following lookup has to refill them.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // Per-thread bookkeeping of value mismatches.
    thread_local bool print_unequal{false};
    thread_local uint64_t err_times{0};
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          if (batch == 2) {
            // Batch 2 only counts mismatches and prints the first one.
            if (h_vectors[i * options.dim + j] !=
                static_cast<float>(h_keys[i] * 0.00001)) {
              if (!print_unequal) {
                std::cout << " [Thread " << task_n << "]\t";
                UNEQUAL_EXPR(h_vectors[i * options.dim + j],
                             static_cast<float>(h_keys[i] * 0.00001));
                print_unequal = true;
              }
              err_times += 1;
            }
          } else {
            // Batches 0 and 1 require an exact match.
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
    }

    bool print_thread_id{false};
    if (batch == 0 || batch == 1) {
      ASSERT_EQ(found_num, KEY_NUM);
      ASSERT_EQ(err_times, 0);
    } else {
      // Batch 2: report missing keys or mismatches without failing.
      if (found_num != KEY_NUM or err_times != 0) {
        std::cout << " [Thread " << task_n << "]\t"
                  << "Number of keys(insert/found/error) : " << "(" << KEY_NUM
                  << "/" << found_num << "/" << err_times << ") \t";
        print_thread_id = true;
      }
    }
    if (current_capacity != table->capacity() && !capacity_silent) {
      if (!print_thread_id) std::cout << " [Thread " << task_n << "]\t";

      std::cout << "The capacity changed from " << current_capacity << " to "
                << table->capacity() << std::endl;
    } else if (print_thread_id) {
      // Terminate the line started by the key-count report above.
      std::cout << std::endl;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));

    // NOTE(review): this repeats the copy made before the checks; the host
    // buffer is freed right below without being read again.
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };
  // worker2: insert the keys, overwrite their values through assign, then
  // verify that a lookup returns the overwritten values.
  auto worker2 = [&table, KEY_NUM, options, capacity_silent](int batch,
                                                             int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    // Remembered so a capacity change during this worker can be reported.
    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    V* d_new_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    // Each worker uses its own stream.
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    // Every byte of the replacement vectors is set to 2; the expected value
    // checked below is the same byte pattern (0x2020202) read as a float.
    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));

    // Step 1: insert the keys with their generated vectors.
    find_or_insert_safe_ptr(table.get(), KEY_NUM, d_keys, nullptr, d_vectors,
                            options.dim, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    // Step 2: overwrite the stored vectors with the replacement pattern.
    table->assign(KEY_NUM, d_keys, d_new_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Step 3: clear the device vectors and read the values back.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      // NOTE(review): read_from_ptr is defined outside this view; presumably
      // it copies the rows addressed by d_vectors_ptr into d_vectors.
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // Per-thread bookkeeping of value mismatches.
    thread_local bool print_unequal{false};
    thread_local uint64_t err_times{0};
    // Bit pattern of the replacement vectors written by cudaMemset above.
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          if (batch == 2) {
            // Batch 2 only counts mismatches and prints the first one.
            if (h_vectors[i * options.dim + j] !=
                *(reinterpret_cast<float*>(&i_value))) {
              if (!print_unequal) {
                std::cout << " [Thread " << task_n << "]\t";
                UNEQUAL_EXPR(h_vectors[i * options.dim + j],
                             *(reinterpret_cast<float*>(&i_value)));
                print_unequal = true;
              }
              err_times += 1;
            }
          } else {
            // Batches 0 and 1 require an exact match.
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      *(reinterpret_cast<float*>(&i_value)));
          }
        }
      }
    }

    bool print_thread_id{false};
    if (batch == 0 || batch == 1) {
      ASSERT_EQ(found_num, KEY_NUM);
      ASSERT_EQ(err_times, 0);
    } else {
      // Batch 2: report missing keys or mismatches without failing.
      if (found_num != KEY_NUM or err_times != 0) {
        std::cout << " [Thread " << task_n << "]\t"
                  << "Number of keys(insert/found/error) : " << "(" << KEY_NUM
                  << "/" << found_num << "/" << err_times << ") \t";
        print_thread_id = true;
      }
    }
    if (current_capacity != table->capacity() && !capacity_silent) {
      if (!print_thread_id) std::cout << " [Thread " << task_n << "]\t";

      std::cout << "The capacity changed from " << current_capacity << " to "
                << table->capacity() << std::endl;
    } else if (print_thread_id) {
      // Terminate the line started by the key-count report above.
      std::cout << std::endl;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));

    // NOTE(review): this repeats the copy made before the checks; the host
    // buffer is freed right below without being read again.
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_new_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  /* the table is relatively idle; it is assumed that no eviction happens */
  int batch = 0;
  std::cout << "[Batch 0] " << BATCH_0_SIZE << " threads\n";
  // Threads are started in worker1/worker2 pairs, hence the step of 2.
  for (int i = 0; i < BATCH_0_SIZE; i += 2) {
    threads.emplace_back(std::thread(worker1, batch, i));
    threads.emplace_back(std::thread(worker2, batch, i + 1));
  }
  for (auto& th : threads) {
    th.join();
  }
  threads.clear();

  /* test the correctness of the APIs serially */
  batch = 1;
  std::cout << "[Batch 1] " << BATCH_1_SIZE << " threads\n";
  for (int i = BATCH_0_SIZE; i < BATCH_0_SIZE + BATCH_1_SIZE; i += 2) {
    // Each thread is joined before the next one starts.
    auto th = std::thread(worker1, batch, i);
    th.join();
    th = std::thread(worker2, batch, i + 1);
    th.join();
  }

  /* eviction may occur */
  batch = 2;
  std::cout << "[Batch 2] " << BATCH_2_SIZE << " threads\n";
  for (int i = BATCH_0_SIZE + BATCH_1_SIZE; i < THREAD_N; i += 2) {
    threads.emplace_back(std::thread(worker1, batch, i));
    threads.emplace_back(std::thread(worker2, batch, i + 1));
  }
  for (auto& th : threads) {
    th.join();
  }
  // After all batches the table is expected to have grown to MAX_CAPACITY.
  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
}

template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckFindOrInsertValues(Table* table, K* keys, V* values, S* scores,
                             size_t len, cudaStream_t stream) {
  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;
  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;

  size_t table_size_before = table->size(stream);
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaStreamSynchronize(stream));

  for (size_t i = 0; i < table_size_verify0; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    map_before_insert[h_tmp_keys[i]] = *vec;
  }

  auto start = std::chrono::steady_clock::now();

  find_or_insert_safe_ptr(table, len, d_tmp_keys, nullptr, values, dim, stream);

  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  size_t new_cap = table_size_after;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int64_t new_cap_K = (int64_t)new_cap;
  for (int64_t i = new_cap_K - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    map_after_insert[h_tmp_keys[i]] = *vec;
  }

  size_t value_diff_cnt = 0;
  for (auto& it : map_after_insert) {
    test_util::ValueArray<V, dim>& vec = map_after_insert.at(it.first);
    for (size_t j = 0; j < dim; j++) {
      if (vec[j] != static_cast<float>(it.first * 0.00001)) {
        ++value_diff_cnt;
        break;
      }
    }
  }
  ASSERT_EQ(value_diff_cnt, 0);
  std::cout << "Check find_or_insert behavior got "
            << "value_diff_cnt: " << value_diff_cnt
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_find_or_insert_values_check(size_t max_hbm_for_vectors) {
  const size_t U = 524288;
  const size_t init_capacity = 1024;
  const size_t B = 524288 + 13;
  constexpr size_t dim = 64;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
  opt.dim = 64;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<K, V, S> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  S score = 0;
  for (int i = 0; i < 20; i++) {
    test_util::create_random_keys<K, S, V, dim>(
        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, B * 16);
    data_buffer.SyncData(true, stream);

    CheckFindOrInsertValues<K, V, S, Table, dim>(
        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),
        data_buffer.scores_ptr(), B, stream);

    offset += B;
    score += 1;
  }
}

// Test registrations. Every case calls its helper twice: once with 16 and
// once with 0 as the first argument. In the helpers visible in this file the
// first argument is `max_hbm_for_vectors` (converted with nv::merlin::GB)
// and the optional second argument is `key_start`, which is stored in
// `options.reserved_key_start_bit`.
// NOTE(review): the remaining helpers are defined outside this view and
// presumably follow the same signature - confirm.
TEST(FindOrInsertPtrTest, test_export_batch_if) {
  test_export_batch_if(16);
  test_export_batch_if(0, 33);
}
TEST(FindOrInsertPtrTest, test_basic) {
  test_basic(16, 3);
  test_basic(0);
}
TEST(FindOrInsertPtrTest, test_basic_when_full) {
  test_basic_when_full(16, 4);
  test_basic_when_full(0);
}
TEST(FindOrInsertPtrTest, test_erase_if_pred) {
  test_erase_if_pred(16);
  test_erase_if_pred(0, 18);
}
TEST(FindOrInsertPtrTest, test_rehash) {
  test_rehash(16);
  test_rehash(0, 44);
}
TEST(FindOrInsertPtrTest, test_rehash_on_big_batch) {
  test_rehash_on_big_batch(16, 23);
  test_rehash_on_big_batch(0);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_lru_basic) {
  test_evict_strategy_lru_basic(16);
  test_evict_strategy_lru_basic(0, 18);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_lfu_basic) {
  test_evict_strategy_lfu_basic(16, 29);
  test_evict_strategy_lfu_basic(0);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_epochlru_basic) {
  test_evict_strategy_epochlru_basic(16, 45);
  test_evict_strategy_epochlru_basic(0);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_epochlfu_basic) {
  test_evict_strategy_epochlfu_basic(16);
  test_evict_strategy_epochlfu_basic(0, 59);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_customized_basic) {
  test_evict_strategy_customized_basic(16, 38);
  test_evict_strategy_customized_basic(0);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_customized_advanced) {
  test_evict_strategy_customized_advanced(16);
  test_evict_strategy_customized_advanced(0, 25);
}
// Runs the correct-rate check with max_hbm_for_vectors = 16 always, and with
// max_hbm_for_vectors = 0 only when the IS_BLOSSOM_CI environment variable
// is not set.
TEST(FindOrInsertPtrTest, test_evict_strategy_customized_correct_rate) {
  // TODO(rhdong): remove the skip logic once the blossom CI issue is resolved.
  const char* blossom_ci_flag = std::getenv("IS_BLOSSOM_CI");
  const bool run_hmem_check = (blossom_ci_flag == nullptr);

  test_evict_strategy_customized_correct_rate(16, 16);

  if (!run_hmem_check) {
    std::cout << "The HMEM check is skipped in blossom CI!" << std::endl;
    return;
  }
  test_evict_strategy_customized_correct_rate(0);
}

// Turn on to verify that it can't deal with multi-threaded cases
// TEST(FindOrInsertPtrTest, test_find_or_insert_multi_threads) {
//   test_find_or_insert_multi_threads(16, 0.25f, 0.125f);
//   test_find_or_insert_multi_threads(16, 0.375f, 0.125f);
//   test_find_or_insert_multi_threads(0, 0.25f, 0.125f);
//   test_find_or_insert_multi_threads(0, 0.375f, 0.125f);
// }
// TEST(FindOrInsertPtrTest, test_dynamic_rehash_on_multi_threads) {
//   test_dynamic_rehash_on_multi_threads(16);
//   test_dynamic_rehash_on_multi_threads(0, 19);
// }

// Turn on to verify that it can't deal with the small-capacity case
// TEST(FindOrInsertPtrTest, test_find_or_insert_values_check) {
//   test_find_or_insert_values_check(16);
//   // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.
//   test_find_or_insert_values_check(0);
// }

================================================
FILE: tests/find_or_insert_ptr_test.cc.cu
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * test APIs: find_or_insert and assign,
 * move insert operation from `insert_or_assign` to `find`.
 */

#include <gtest/gtest.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <thread>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

// Number of value elements per key; assigned to `options.dim` and passed as
// the DIM template argument of the test_util key generators below.
constexpr size_t DIM = 16;
// Key, value-element and score types used to instantiate the table under test.
using K = uint64_t;
using V = float;
using S = uint64_t;
// Short aliases for the library's eviction-strategy and table-options types.
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

// Predicate passed to `erase_if` in test_erase_if_pred.
//
// Returns true when the low 7 bits of `key` are greater than `pattern` AND
// `score` is greater than `threshold`.
template <class K, class S>
struct EraseIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    // `>` binds tighter than `&`, so the mask must be parenthesized; without
    // the parentheses the expression is evaluated as `key & (0x7f > pattern)`.
    return (((key & 0x7f) > pattern) && (score > threshold));
  }
};

// Score-only predicate: accepts an entry when its score is strictly greater
// than `threshold`. `key` and `pattern` are part of the signature but are not
// consulted.
template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    const bool above_threshold = (score > threshold);
    return above_threshold;
  }
};

void test_basic(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_new_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));

      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    int found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));
    table->assign(KEY_NUM, reinterpret_cast<const K*>(d_keys),
                  reinterpret_cast<const float*>(d_new_vectors),
                  reinterpret_cast<const S*>(d_scores), stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      test_util::array2ptr(d_vectors_ptr, d_new_vectors, options.dim, KEY_NUM,
                           stream);
      table->find(KEY_NUM, d_keys, d_new_vectors, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_new_vectors, options.dim,
                               KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  *(reinterpret_cast<float*>(&i_value)));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,
                           stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    table->erase(KEY_NUM >> 1, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, total_size >> 1);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ASSERT_EQ(dump_counter, KEY_NUM);
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_new_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Inserts as many random keys as the table has capacity (1M keys, 1M slots),
// erases them all, inserts the same keys again and asserts that the table
// reaches the same size both times.
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//   options.max_hbm_for_vectors.
// key_start: stored in options.reserved_key_start_bit.
void test_basic_when_full(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  // Only keys and scores are generated; the vector output is passed as
  // nullptr and the device vectors are filled by cudaMemset below.
  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // First insertion pass. The block-local buffers shadow the outer
    // d_vectors_ptr / d_found, which are not used by the table calls.
    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    // The size after insertion is recorded rather than asserted against
    // KEY_NUM; it only has to be reproducible (see the final assertion).
    uint64_t total_size_after_insert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Erasing every inserted key must leave the table empty.
    table->erase(KEY_NUM, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, 0);

    // Second insertion pass with the same keys.
    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_reinsert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  // Previously missing: the pinned vector buffer was allocated but never
  // released.
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;
  constexpr uint64_t BUCKET_MAX_SIZE = 128;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

    K pattern = 100;
    S threshold = 0;
    size_t erase_num = table->template erase_if<EraseIfPredFunctor>(
        pattern, threshold, stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ((erase_num + total_size), BUCKET_MAX_SIZE);

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        ASSERT_EQ(h_scores[i], h_keys[i]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, (BUCKET_MAX_SIZE - erase_num));

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_rehash(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;
  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;
  constexpr uint64_t TEST_TIMES = 100;
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaDeviceSynchronize());
    ASSERT_EQ(total_size, KEY_NUM);

    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(dump_counter, KEY_NUM);

    table->reserve(MAX_CAPACITY, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table->capacity(), MAX_CAPACITY);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BUCKET_MAX_SIZE * sizeof(V*)));
      table->find(BUCKET_MAX_SIZE, d_keys, d_vectors_ptr, d_found, d_scores,
                  stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim,
                               BUCKET_MAX_SIZE, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {
      if (h_found[i]) {
        found_num++;
        ASSERT_EQ(h_scores[i], h_keys[i]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, BUCKET_MAX_SIZE);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks capacity growth when batches are large relative to the table:
// inserts the first INIT_KEY_NUM keys, then the full KEY_NUM batch, asserting
// size and capacity after each step, and finally verifies the exported content
// through find.
//
// max_hbm_for_vectors: converted with nv::merlin::GB() and stored in
//   options.max_hbm_for_vectors.
// key_start: stored in options.reserved_key_start_bit.
void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 1024;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024;
  constexpr uint64_t INIT_KEY_NUM = 1024;
  constexpr uint64_t KEY_NUM = 2048;
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  // Previously missing: `key_start` was accepted but never applied, unlike in
  // the sibling tests of this file.
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = 128;
  options.max_load_factor = 0.6;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  uint64_t expected_size = 0;
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);

  // First batch: only the first INIT_KEY_NUM keys. The block-local `d_found`
  // shadows the outer buffer and is only consumed by read_or_write_ptr.
  {
    V** d_vectors_ptr = nullptr;
    bool* d_found;
    CUDA_CHECK(cudaMalloc(&d_found, INIT_KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, INIT_KEY_NUM * sizeof(V*)));
    table->find_or_insert(INIT_KEY_NUM, d_keys, d_vectors_ptr, d_found,
                          d_scores, stream);
    test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found, options.dim,
                                 INIT_KEY_NUM, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaFree(d_vectors_ptr));
    CUDA_CHECK(cudaFree(d_found));
  }

  CUDA_CHECK(cudaStreamSynchronize(stream));
  expected_size = INIT_KEY_NUM;

  // The table is expected to have doubled its capacity to take the batch.
  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));

  // Second batch: all KEY_NUM keys; the first INIT_KEY_NUM are already stored.
  {
    V** d_vectors_ptr = nullptr;
    bool* d_found;
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
    table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                          stream);
    test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found, options.dim,
                                 KEY_NUM, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaFree(d_vectors_ptr));
    CUDA_CHECK(cudaFree(d_found));
  }

  CUDA_CHECK(cudaStreamSynchronize(stream));
  expected_size = KEY_NUM;

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), KEY_NUM * 4);

  // Export overwrites d_keys / d_vectors / d_scores with the table content.
  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                     d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(dump_counter, expected_size);

  // Look the exported keys up again and verify scores and vectors.
  CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
  {
    V** d_vectors_ptr = nullptr;
    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
    table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
    test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                             stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaFree(d_vectors_ptr));
  }
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int found_num = 0;

  CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(
      cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));
  for (int i = 0; i < KEY_NUM; i++) {
    if (h_found[i]) {
      found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
  }
  ASSERT_EQ(found_num, KEY_NUM);

  table->clear(stream);
  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = 4 * 1024;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 256;
  constexpr uint64_t THREAD_N = 8;

  std::vector<std::thread> threads;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);

  auto worker_function = [&table, KEY_NUM, options](int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    while (table->capacity() < MAX_CAPACITY) {
      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                  KEY_NUM);
      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
        table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr,
                              stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                     options.dim, KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      {
        V** d_vectors_ptr = nullptr;
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
        table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
        test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                                 stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));
      int found_num = 0;

      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));

      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
      ASSERT_EQ(found_num, KEY_NUM);
      if (task_n == 0 && current_capacity != table->capacity()) {
        std::cout << "[test_dynamic_rehash_on_multi_threads] The capacity "
                     "changed from "
                  << current_capacity << " to " << table->capacity()
                  << std::endl;
        current_capacity = table->capacity();
      }
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }
    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  for (int i = 0; i < THREAD_N; ++i)
    threads.emplace_back(std::thread(worker_function, i));

  for (auto& th : threads) {
    th.join();
  }
  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
}

/**
 * Checks `export_batch_if<ExportIfPredFunctor>` on a small kLru table.
 *
 * Per round: KEY_NUM random keys are inserted through `find_or_insert`, read
 * back through `find` and verified (`vector element == key * 0.00001`). Then:
 *  1. `export_batch_if` is called with `threshold` (a `host_nano` value taken
 *     before any insert) and the reported counter is compared against the
 *     number of entries in the copied-back score buffer that exceed
 *     `threshold`.
 *  2. `export_batch_if` is called again with a fresh `host_nano` value and
 *     the reported counter must be 0.
 * Finally the table is cleared and must report size 0.
 *
 * NOTE(review): the exact predicate is defined by `ExportIfPredFunctor`,
 * which is outside this function; the assertions suggest "score > threshold".
 *
 * @param max_hbm_for_vectors Passed through `nv::merlin::GB()` into
 *                            `options.max_hbm_for_vectors`.
 * @param key_start Value assigned to `options.reserved_key_start_bit`.
 */
void test_export_batch_if(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  size_t h_dump_counter = 0;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  size_t* d_dump_counter;
  int found_num = 0;
  bool* h_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));

  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Sampled before any insert; used as the threshold of the first export.
  S threshold = test_util::host_nano<S>();
  uint64_t total_size = 0;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                                KEY_NUM);

    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      // Insert all keys. `d_inserted` is a scratch flag buffer, distinct from
      // `d_found`, which is filled by the lookup further below.
      V** d_vectors_ptr = nullptr;
      bool* d_inserted;
      CUDA_CHECK(cudaMalloc(&d_inserted, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_inserted,
                            nullptr, stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_inserted,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_inserted));
    }

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));

    // Clear the staging buffer so the read-back cannot pass on stale input.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    // The host buffers hold KEY_NUM entries, so KEY_NUM is the loop bound
    // (the bucket size must not be used to index them).
    for (uint64_t i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    K pattern = 100;

    // First export: threshold predates every insert.
    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,
        d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));

    size_t expected_export_count = 0;
    for (uint64_t i = 0; i < KEY_NUM; i++) {
      if (h_scores[i] > threshold) expected_export_count++;
    }
    ASSERT_EQ(expected_export_count, h_dump_counter);

    // Second export: fresh timestamp as threshold, nothing may be exported.
    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, test_util::host_nano<S>(), table->capacity(), 0,
        d_dump_counter, d_keys, d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));

    ASSERT_EQ(0, h_dump_counter);

    CUDA_CHECK(cudaMemset(h_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    // `h_dump_counter` was asserted to be 0 above, so this loop only runs if
    // that expectation is relaxed in the future.
    for (size_t i = 0; i < h_dump_counter; i++) {
      ASSERT_GT(h_scores[i], threshold);
      for (size_t j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_dump_counter));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_basic_for_cpu_io(int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(0);
  options.io_by_cpu = true;
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_def_val, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_def_val, 2, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_vectors, 2, KEY_NUM * sizeof(V) * options.dim));
    table->assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
    }
    ASSERT_EQ(found_num, KEY_NUM);

    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,
                           stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    table->erase(KEY_NUM >> 1, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, total_size >> 1);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(dump_counter, KEY_NUM);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Basic score/eviction check for `EvictStrategy::kLru`.
 *
 * Repeated TEST_TIMES times on a fresh table:
 *  1. BASE_KEY_NUM base keys are inserted with `find_or_insert` (no scores
 *     passed). After `export_batch`, every exported score must lie within
 *     [start_ts, end_ts], the `host_nano` timestamps taken around the insert.
 *  2. TEST_KEY_NUM test keys are pushed through `assign` and then
 *     `find_or_insert`. Two of the test keys are copies of base keys 72 and
 *     73; the other two come from a different key range. The table size must
 *     stay at BUCKET_MAX_SIZE. Exported entries whose key is a test key must
 *     have scores within [start_ts, end_ts] of this phase; all other entries
 *     must have scores `<= start_ts`.
 * In both phases each exported vector element must equal `key * 0.00001`.
 *
 * NOTE(review): `create_keys_in_one_buckets` presumably generates keys that
 * all land in the same bucket, which is why the size stays at
 * BUCKET_MAX_SIZE after phase 2; confirm in test_util.
 *
 * @param max_hbm_for_vectors Passed through `nv::merlin::GB()` into
 *                            `options.max_hbm_for_vectors`.
 * @param key_start Value assigned to `options.reserved_key_start_bit`.
 */
void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make two of the test keys duplicates of existing base keys (and carry
  // over their vectors) so phase 2 touches both new and existing entries.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S start_ts;
  S end_ts;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys and validate exported scores/vectors.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));

      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));
        start_ts = test_util::host_nano<S>(stream);
        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              nullptr, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BASE_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        end_ts = test_util::host_nano<S>(stream);
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Sorting exposes the smallest and largest score for the range check.
      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);
      for (size_t i = 0; i < dump_counter; i++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: assign + find_or_insert the test keys and validate again.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      start_ts = test_util::host_nano<S>(stream);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);

      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));
        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              nullptr, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, TEST_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        end_ts = test_util::host_nano<S>(stream);
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Collect the scores of exported entries that belong to the test keys;
      // only the first `ctr` slots of the array are ever initialised.
      std::array<S, TEMP_KEY_NUM> h_scores_temp_sorted;
      int ctr = 0;
      for (uint64_t i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {
          ASSERT_GT(h_scores_temp[i], BUCKET_MAX_SIZE);
          h_scores_temp_sorted[ctr++] = h_scores_temp[i];
        } else {
          ASSERT_LE(h_scores_temp[i], start_ts);
        }
      }
      // Without at least one match, indexing [0] / [ctr - 1] below would read
      // uninitialised or out-of-bounds storage; fail cleanly instead.
      ASSERT_GT(ctr, 0);
      std::sort(h_scores_temp_sorted.begin(),
                h_scores_temp_sorted.begin() + ctr);

      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);
      for (size_t i = 0; i < dump_counter; i++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Basic score/eviction check for `EvictStrategy::kLfu`.
 *
 * Repeated TEST_TIMES times on a fresh table:
 *  1. BASE_KEY_NUM base keys are inserted with `find_or_insert`, passing the
 *     generated scores. After `export_batch`, every exported score must equal
 *     `key % freq_range`.
 *  2. TEST_KEY_NUM test keys are pushed through `assign` and then
 *     `find_or_insert`, both with scores. Two test keys are copies of base
 *     keys 72 and 73 (scores set to `key % freq_range`). The table size must
 *     stay at BUCKET_MAX_SIZE. Exported entries whose key is in both the base
 *     and the test set must have score `(key % freq_range) * 3`; all other
 *     entries must have score `key % freq_range`.
 * In both phases each exported vector element must equal `key * 0.00001`.
 *
 * NOTE(review): `create_keys_in_one_buckets_lfu` presumably generates keys
 * that share one bucket with scores `key % freq_range`; confirm in
 * test_util.
 *
 * @param max_hbm_for_vectors Passed through `nv::merlin::GB()` into
 *                            `options.max_hbm_for_vectors`.
 * @param key_start Value assigned to `options.reserved_key_start_bit`.
 */
void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Make two of the test keys duplicates of existing base keys, with matching
  // scores and vectors, so phase 2 touches both new and existing entries.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys with their scores and validate export.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));
        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BASE_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: assign + find_or_insert the test keys and validate again.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));
        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, TEST_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);
        bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);
        if (in_base && in_test) {
          // Key was scored three times: the base insert, the assign and the
          // second find_or_insert.
          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) *
                                          3);  // update score when found.
        } else {
          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));
        }
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/*
 * Basic eviction test for `EvictStrategy::kEpochLru`.
 *
 * Every round builds a fresh table and runs two phases:
 *   1. Insert BASE_KEY_NUM keys with `find_or_insert` under the current
 *      global epoch, then check that every exported score lies within
 *      [(epoch << 32) | start_ts, (epoch << 32) | end_ts].
 *   2. Advance the epoch, touch TEST_KEY_NUM keys (entries 2 and 3 are copies
 *      of base keys 72 and 73) with `assign` + `find_or_insert`, then check
 *      that touched keys carry a score inside the new epoch's window while
 *      every other key is at or below the new window's lower bound.
 * After each phase the table is expected to hold exactly BUCKET_MAX_SIZE
 * entries and every exported vector element must equal `key * 0.00001`.
 *
 * `start_ts` / `end_ts` are the host nanosecond clock shifted right by
 * RSHIFT_ON_NANO and masked to 32 bits.
 *
 * max_hbm_for_vectors: forwarded through `nv::merlin::GB()` into
 *                      `TableOptions::max_hbm_for_vectors`.
 * key_start:           stored in `TableOptions::reserved_key_start_bit`.
 */
void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr int RSHIFT_ON_NANO = 20;

  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device staging buffers, reused for both uploads and exports.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make the last two test keys (and their vectors) duplicates of base keys,
  // so the second phase touches keys that are already in the table.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (size_t d = 0; d < options.dim; ++d) {
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S start_ts = 0;
  S end_ts = 0;

  // The epoch is never reset: it keeps growing across rounds.
  S global_epoch = 1;
  for (uint64_t round = 0; round < TEST_TIMES; ++round) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0UL);

    // Phase 1: insert the base keys under the current epoch.
    {
      // Scores are uploaded as well, although the table calls below pass
      // nullptr for the score argument.
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));
        start_ts =
            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
        table->set_global_epoch(global_epoch);
        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              nullptr, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BASE_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        // Sample the upper timestamp only after the stream has drained.
        end_ts =
            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      // Expected score window: epoch in the high 32 bits, timestamp below.
      const S score_floor = (global_epoch << 32) | start_ts;
      const S score_ceil = (global_epoch << 32) | end_ts;
      ASSERT_GE(h_scores_temp_sorted[0], score_floor);
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], score_ceil);
      for (size_t i = 0; i < dump_counter; ++i) {
        for (size_t j = 0; j < options.dim; ++j) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: move to the next epoch and touch the test keys.
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
      table->set_global_epoch(global_epoch);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));
        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              nullptr, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, TEST_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));

        end_ts =
            (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      const S score_floor = (global_epoch << 32) | start_ts;
      const S score_ceil = (global_epoch << 32) | end_ts;

      // Scores of the keys that were touched in this phase.
      std::vector<S> touched_scores;
      touched_scores.reserve(TEMP_KEY_NUM);
      // Scan only the entries that export_batch actually produced.
      for (size_t i = 0; i < dump_counter; ++i) {
        const bool touched =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);
        if (touched) {
          ASSERT_GE(h_scores_temp[i], score_floor);
          touched_scores.push_back(h_scores_temp[i]);
        } else {
          // Untouched keys keep the score they got in phase 1.
          ASSERT_LE(h_scores_temp[i], score_floor);
        }
      }
      std::sort(touched_scores.begin(), touched_scores.end());

      if (!touched_scores.empty()) {
        ASSERT_GE(touched_scores.front(), score_floor);
        ASSERT_LE(touched_scores.back(), score_ceil);
      }
      for (size_t i = 0; i < dump_counter; ++i) {
        for (size_t j = 0; j < options.dim; ++j) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/*
 * Basic eviction test for `EvictStrategy::kEpochLfu`.
 *
 * Every round builds a fresh table and runs two phases:
 *   1. Insert BASE_KEY_NUM keys with caller-supplied scores under the current
 *      global epoch and compare every exported score with
 *      `test_util::make_expected_score_for_epochlfu(epoch, score)`.
 *   2. Advance the epoch, then `assign` + `find_or_insert` TEST_KEY_NUM keys.
 *      Test entries 1..3 are copies of base keys 71..73; base key 71 carries
 *      a score of UINT32_MAX - 1 to simulate overflow of the low 32 bits.
 *      Exported scores are again compared with the helper's expectation.
 * After each phase the table is expected to hold exactly BUCKET_MAX_SIZE
 * entries and every exported vector element must equal `key * 0.00001`.
 *
 * max_hbm_for_vectors: forwarded through `nv::merlin::GB()` into
 *                      `TableOptions::max_hbm_for_vectors`.
 * key_start:           stored in `TableOptions::reserved_key_start_bit`.
 */
void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device staging buffers, reused for both uploads and exports.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  // The assertions below expect a regular key's frequency to be
  // `key % freq_range`.
  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Simulate overflow of low 32bits.
  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -
                                     static_cast<uint32_t>(1));

  // Test entries 1..3 duplicate base keys 71..73 (key, score and vector).
  h_keys_test[1] = h_keys_base[71];
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[1] = h_scores_base[71];
  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (size_t d = 0; d < options.dim; ++d) {
    h_vectors_test[1 * options.dim + d] = h_vectors_base[71 * options.dim + d];
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  // The epoch is never reset: it keeps growing across rounds.
  S global_epoch = 1;
  for (uint64_t round = 0; round < TEST_TIMES; ++round) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0UL);

    // Phase 1: insert the base keys with their scores.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));
        table->set_global_epoch(global_epoch);
        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BASE_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; ++i) {
        const K key = h_keys_temp[i];
        if (key == h_keys_base[71]) {
          // The overflow key was inserted with its overridden score.
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, h_scores_base[71]);
          ASSERT_EQ(h_scores_temp[i], expected_score);
        } else {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, (key % freq_range));
          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (size_t j = 0; j < options.dim; ++j) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(key * 0.00001));
        }
      }
    }

    // Phase 2: move to the next epoch and touch the test keys.
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));
        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, TEST_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // The overflow key must still be present in the table.
      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),
                                                 h_keys_temp.end(),
                                                 h_keys_base[71]));

      for (size_t i = 0; i < dump_counter; ++i) {
        const K key = h_keys_temp[i];
        const bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), key);
        const bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), key);

        if (in_base && in_test) {
          if (key == h_keys_base[71]) {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch, h_scores_base[71] * 2);
            ASSERT_EQ(h_scores_temp[i], expected_score);
          } else {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch,
                (key % freq_range) * 3);  // update score when found.
            ASSERT_EQ(h_scores_temp[i], expected_score);
          }
        } else {
          // h_keys_base[71] is also h_keys_test[1], so it always takes the
          // branch above and needs no special case here. Keys that are only
          // in the base set keep the previous epoch; keys that are only in
          // the test set were first inserted in the current epoch.
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch - static_cast<S>(in_base), (key % freq_range));
          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (size_t j = 0; j < options.dim; ++j) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(key * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 128;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  const S base_score_start = 1000;
  for (int i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);
  const S test_score_start = base_score_start + BASE_KEY_NUM;
  for (int i = 0; i < TEST_KEY_NUM; i++) {
    h_scores_test[i] = test_score_start + i;
  }
  for (int i = 64; i < TEST_KEY_NUM; i++) {
    h_keys_test[i] = h_keys_base[i];
    for (int j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, BASE_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BASE_KEY_NUM * sizeof(V*)));
        table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BASE_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      {
        V** d_vectors_ptr = nullptr;
        bool* d_found;
        CUDA_CHECK(cudaMalloc(&d_found, TEST_KEY_NUM * sizeof(bool)));
        CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEST_KEY_NUM * sizeof(V*)));
        table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, TEST_KEY_NUM, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
        CUDA_CHECK(cudaFree(d_vectors_ptr));
        CUDA_CHECK(cudaFree(d_found));
      }

      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range_test =
          test_util::range<S, TEST_KEY_NUM>(test_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range_test.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Checks score-driven admission for EvictStrategy::kCustomized when the table
 * already holds BUCKET_MAX_SIZE keys generated by
 * test_util::create_keys_in_one_buckets.
 *
 * Every one of the TEST_TIMES iterations builds a fresh table and then:
 *   1. inserts BASE_KEY_NUM base keys scored base_score_start,
 *      base_score_start + 1, ... and verifies size, exported scores and
 *      exported vectors;
 *   2. submits TEST_KEY_NUM keys through assign() followed by
 *      find_or_insert(): four new keys whose scores are lower than every base
 *      score and four base keys carrying new scores. The test then asserts
 *      that the table size is unchanged, that the four new keys are absent,
 *      that the four base keys are still present with the new scores, and
 *      that every exported vector equals static_cast<float>(key * 0.00001).
 *
 * @param max_hbm_for_vectors Passed through nv::merlin::GB() into
 *                            TableOptions::max_hbm_for_vectors.
 * @param key_start           Value for TableOptions::reserved_key_start_bit.
 *
 * NOTE: a failing ASSERT_* returns immediately, so the device buffers and the
 * stream are not released on that path.
 */
void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,
                                             int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 8;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 256;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  // Scratch outputs of find_or_insert(). They are sized for the larger of the
  // two batches and allocated once rather than twice per iteration.
  V** d_vectors_ptr = nullptr;
  bool* d_found = nullptr;
  CUDA_CHECK(cudaMalloc(&d_found, TEMP_KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, TEMP_KEY_NUM * sizeof(V*)));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  // Overwrite the generated scores with a known, strictly increasing range so
  // the exported scores can be compared against test_util::range below.
  const S base_score_start = 1000;
  for (uint64_t i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // The second half of the test batch re-uses keys that are already resident.
  h_keys_test[4] = h_keys_base[72];
  h_keys_test[5] = h_keys_base[73];
  h_keys_test[6] = h_keys_base[74];
  h_keys_test[7] = h_keys_base[75];

  // Four new keys with scores lower than every base score: they must not be
  // inserted.
  h_scores_test[0] = 20;
  h_scores_test[1] = 78;
  h_scores_test[2] = 97;
  h_scores_test[3] = 98;

  // Four existing keys with new scores: only their scores are refreshed.
  h_scores_test[4] = 99;
  h_scores_test[5] = 1010;
  h_scores_test[6] = 1020;
  h_scores_test[7] = 1035;

  // The re-used keys need vectors that match the value pattern checked below.
  for (uint64_t i = 4; i < TEST_KEY_NUM; i++) {
    for (size_t j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] =
          static_cast<V>(h_keys_test[i] * 0.00001);
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t t = 0; t < TEST_TIMES; t++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys and verify the exported content.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));

      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                            d_scores_temp, stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                   options.dim, BASE_KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Export order is not assumed, so compare the sorted scores.
      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (size_t i = 0; i < dump_counter; i++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: submit the mixed batch and verify admission / score refresh.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);

      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_ptr, d_found,
                            d_scores_temp, stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                   options.dim, TEST_KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Indices 0..3 are the low-score new keys (must be absent); indices
      // 4..7 are the re-used base keys (must still be present).
      for (uint64_t i = 0; i < TEST_KEY_NUM; i++) {
        if (i < 4) {
          ASSERT_EQ(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        } else {
          ASSERT_NE(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        }
      }
      for (uint64_t i = 0; i < TEMP_KEY_NUM; i++) {
        // The re-used keys must carry the scores submitted in this batch.
        if (h_keys_temp[i] == h_keys_test[4])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);
        if (h_keys_temp[i] == h_keys_test[5])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);
        if (h_keys_temp[i] == h_keys_test[6])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);
        if (h_keys_temp[i] == h_keys_test[7])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[7]);

        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Measures how well EvictStrategy::kCustomized keeps the highest-scored keys
 * when the table is repeatedly overfilled.
 *
 * In each of `rounds` rounds, STEPS batches of BATCH_SIZE keys produced by
 * test_util::create_continuous_keys are submitted through assign() followed
 * by find_or_insert(); the start key keeps advancing across rounds. After a
 * round the table is exported and the test asserts that
 *   - every exported key equals its exported score,
 *   - every exported vector element equals static_cast<float>(key * 0.00001),
 *   - the largest exported key is at least the last key of the round, and
 *   - the share of exported entries whose score is at least the first key of
 *     the round (relative to MAX_CAPACITY) reaches expected_correct_rate.
 *
 * @param max_hbm_for_vectors Passed through nv::merlin::GB() into
 *                            TableOptions::max_hbm_for_vectors.
 * @param key_start           Value for TableOptions::reserved_key_start_bit.
 *
 * NOTE: a failing ASSERT_* returns immediately, so the device buffers and the
 * stream are not released on that path.
 */
void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors,
                                                 int key_start = 0) {
  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
  constexpr uint64_t STEPS = 128;
  constexpr uint64_t MAX_BUCKET_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t TEST_TIMES = 1;
  float expected_correct_rate = 0.964;
  const int rounds = 12;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = MAX_BUCKET_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Keep the HostBuffer owners alive for the whole test. Calling .ptr() on a
  // temporary would leave the raw pointer outliving the object that owns it.
  test_util::HostBuffer<K> keys_base_buf(BATCH_SIZE);
  test_util::HostBuffer<S> scores_base_buf(BATCH_SIZE);
  test_util::HostBuffer<V> vectors_base_buf(BATCH_SIZE * options.dim);
  K* h_keys_base = keys_base_buf.ptr();
  S* h_scores_base = scores_base_buf.ptr();
  V* h_vectors_base = vectors_base_buf.ptr();

  test_util::HostBuffer<K> keys_temp_buf(MAX_CAPACITY);
  test_util::HostBuffer<S> scores_temp_buf(MAX_CAPACITY);
  test_util::HostBuffer<V> vectors_temp_buf(MAX_CAPACITY * options.dim);
  K* h_keys_temp = keys_temp_buf.ptr();
  S* h_scores_temp = scores_temp_buf.ptr();
  V* h_vectors_temp = vectors_temp_buf.ptr();

  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));

  // Scratch outputs of find_or_insert(), allocated once instead of once per
  // step (rounds * STEPS allocations in total).
  V** d_vectors_ptr = nullptr;
  bool* d_found = nullptr;
  CUDA_CHECK(cudaMalloc(&d_found, BATCH_SIZE * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BATCH_SIZE * sizeof(V*)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t global_start_key = 100000;
  for (uint64_t t = 0; t < TEST_TIMES; t++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    size_t start_key = global_start_key;

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    for (int r = 0; r < rounds; r++) {
      // Key range submitted during this round.
      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;
      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;
      // The first round starts from an empty table, so it is only required to
      // reach the expected rate; later rounds must leave the table full.
      size_t expected_table_size =
          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)
                   : INIT_CAPACITY;

      for (uint64_t s = 0; s < STEPS; s++) {
        test_util::create_continuous_keys<K, S, V, DIM>(
            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);
        start_key += BATCH_SIZE;

        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),
                              cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,
                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,
                              BATCH_SIZE * sizeof(V) * options.dim,
                              cudaMemcpyHostToDevice));
        table->assign(BATCH_SIZE, d_keys_temp, d_vectors_temp, d_scores_temp,
                      stream);
        table->find_or_insert(BATCH_SIZE, d_keys_temp, d_vectors_ptr, d_found,
                              d_scores_temp, stream);
        test_util::read_or_write_ptr(d_vectors_ptr, d_vectors_temp, d_found,
                                     options.dim, BATCH_SIZE, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_GE(total_size, expected_table_size);
      ASSERT_EQ(MAX_CAPACITY, table->capacity());

      size_t dump_counter = table->export_batch(
          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,
                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,
                            MAX_CAPACITY * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));

      size_t bigger_score_counter = 0;
      K max_key = 0;
      size_t values_error_counter = 0;
      for (size_t i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);
        max_key = std::max(max_key, h_keys_temp[i]);
        // Entries scored inside this round's key range count as "correct".
        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;
        for (size_t j = 0; j < options.dim; j++) {
          if (h_vectors_temp[i * options.dim + j] !=
              static_cast<float>(h_keys_temp[i] * 0.00001)) {
            values_error_counter++;
          }
        }
      }

      ASSERT_EQ(values_error_counter, 0);
      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;
      std::cout << std::setprecision(3) << "[Round " << r << "]"
                << "correct_rate=" << correct_rate << std::endl;
      ASSERT_GE(max_key, expected_max_key);
      ASSERT_GE(correct_rate, expected_correct_rate);
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Drives one shared EvictStrategy::kLru table from THREAD_N host threads, each
 * with its own CUDA stream and its own set of KEY_NUM random keys.
 *
 * The threads are split into three batches:
 *   - batch 0: BATCH_0_SIZE threads started together; results are asserted.
 *   - batch 1: BATCH_1_SIZE threads run one after another; results are
 *              asserted.
 *   - batch 2: the remaining threads started together; mismatches are only
 *              counted and printed, never asserted.
 * After all batches the table capacity must equal MAX_CAPACITY.
 *
 * Two worker flavours alternate (even task ids run worker1, odd run worker2):
 *   - worker1: assign() on keys that were not inserted, find() (expects zero
 *              hits), find_or_insert(), find(), then compares each found
 *              vector with static_cast<float>(key * 0.00001).
 *   - worker2: find_or_insert(), assign() with vectors whose bytes are all
 *              0x02, find(), then compares each found vector element with
 *              the float whose bit pattern is 0x02020202.
 *
 * @param max_hbm_for_vectors Passed through nv::merlin::GB() into
 *                            TableOptions::max_hbm_for_vectors.
 * @param BATCH_0_RATIO       Fraction of THREAD_N assigned to batch 0.
 * @param BATCH_1_RATIO       Fraction of THREAD_N assigned to batch 1.
 * @param capacity_silent     When false, a worker prints a line if the table
 *                            capacity changed while it was running.
 */
void test_find_or_insert_multi_threads(size_t max_hbm_for_vectors,
                                       const float BATCH_0_RATIO,
                                       const float BATCH_1_RATIO,
                                       bool capacity_silent = true) {
  constexpr uint64_t THREAD_N = 64UL;
  const uint64_t BATCH_0_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_0_RATIO);
  const uint64_t BATCH_1_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_1_RATIO);
  const uint64_t BATCH_2_SIZE = THREAD_N - BATCH_0_SIZE - BATCH_1_SIZE;

  constexpr uint64_t INIT_CAPACITY = 32 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = 128 * 1024 * 1024UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;

  std::vector<std::thread> threads;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);

  // worker1: assign-before-insert must be a no-op, then insert and read back.
  // The checks below assume every key is different.
  auto worker1 = [&table, KEY_NUM, options, capacity_silent](int batch,
                                                             int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    // Remembered so a capacity change during this worker can be reported.
    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    // assign() runs before the keys were ever inserted; the find() that
    // follows is expected to report no hits.
    table->assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    {
      int found_num = 0;
      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
        }
      }
      ASSERT_EQ(found_num, 0);
    }

    {
      V** d_vectors_ptr = nullptr;
      // Separate flag buffer for the insert; the outer d_found keeps the
      // result of find().
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Clear the staging vectors so the read-back below cannot pass on stale
    // data.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // thread_local: every std::thread running this lambda gets its own copy.
    thread_local bool print_unequal{false};
    thread_local uint64_t err_times{0};
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          if (batch == 2) {
            // Batch 2 only counts mismatches and prints the first one.
            if (h_vectors[i * options.dim + j] !=
                static_cast<float>(h_keys[i] * 0.00001)) {
              if (!print_unequal) {
                std::cout << " [Thread " << task_n << "]\t";
                UNEQUAL_EXPR(h_vectors[i * options.dim + j],
                             static_cast<float>(h_keys[i] * 0.00001));
                print_unequal = true;
              }
              err_times += 1;
            }
          } else {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
    }

    bool print_thread_id{false};
    if (batch == 0 || batch == 1) {
      ASSERT_EQ(found_num, KEY_NUM);
      ASSERT_EQ(err_times, 0);
    } else {
      if (found_num != KEY_NUM or err_times != 0) {
        std::cout << " [Thread " << task_n << "]\t"
                  << "Number of keys(insert/found/error) : " << "(" << KEY_NUM
                  << "/" << found_num << "/" << err_times << ") \t";
        print_thread_id = true;
      }
    }
    if (current_capacity != table->capacity() && !capacity_silent) {
      if (!print_thread_id) std::cout << " [Thread " << task_n << "]\t";

      std::cout << "The capacity changed from " << current_capacity << " to "
                << table->capacity() << std::endl;
    } else if (print_thread_id) {
      std::cout << std::endl;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  // worker2: insert, overwrite the values through assign(), then read back.
  auto worker2 = [&table, KEY_NUM, options, capacity_silent](int batch,
                                                             int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    // Remembered so a capacity change during this worker can be reported.
    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    V* d_new_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    // Byte-wise fill: every byte of the replacement vectors becomes 0x02, so
    // each 4-byte element has the bit pattern 0x02020202 checked below.
    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));

    {
      V** d_vectors_ptr = nullptr;
      // Separate flag buffer for the insert; the outer d_found is filled by
      // find() further down.
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr,
                            stream);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    table->assign(KEY_NUM, d_keys, d_new_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Clear the staging vectors so the read-back below cannot pass on stale
    // data.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // thread_local: every std::thread running this lambda gets its own copy.
    thread_local bool print_unequal{false};
    thread_local uint64_t err_times{0};
    // Bit pattern produced by the byte-wise cudaMemset(…, 2, …) above.
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          if (batch == 2) {
            // Batch 2 only counts mismatches and prints the first one.
            if (h_vectors[i * options.dim + j] !=
                *(reinterpret_cast<float*>(&i_value))) {
              if (!print_unequal) {
                std::cout << " [Thread " << task_n << "]\t";
                UNEQUAL_EXPR(h_vectors[i * options.dim + j],
                             *(reinterpret_cast<float*>(&i_value)));
                print_unequal = true;
              }
              err_times += 1;
            }
          } else {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      *(reinterpret_cast<float*>(&i_value)));
          }
        }
      }
    }

    bool print_thread_id{false};
    if (batch == 0 || batch == 1) {
      ASSERT_EQ(found_num, KEY_NUM);
      ASSERT_EQ(err_times, 0);
    } else {
      if (found_num != KEY_NUM or err_times != 0) {
        std::cout << " [Thread " << task_n << "]\t"
                  << "Number of keys(insert/found/error) : " << "(" << KEY_NUM
                  << "/" << found_num << "/" << err_times << ") \t";
        print_thread_id = true;
      }
    }
    if (current_capacity != table->capacity() && !capacity_silent) {
      if (!print_thread_id) std::cout << " [Thread " << task_n << "]\t";

      std::cout << "The capacity changed from " << current_capacity << " to "
                << table->capacity() << std::endl;
    } else if (print_thread_id) {
      std::cout << std::endl;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_new_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  /* the table is relatively idle; assume there is no eviction */
  int batch = 0;
  std::cout << "[Batch 0] " << BATCH_0_SIZE << " threads\n";
  for (int i = 0; i < BATCH_0_SIZE; i += 2) {
    threads.emplace_back(std::thread(worker1, batch, i));
    threads.emplace_back(std::thread(worker2, batch, i + 1));
  }
  for (auto& th : threads) {
    th.join();
  }
  threads.clear();

  /* test the correctness of the APIs serially */
  batch = 1;
  std::cout << "[Batch 1] " << BATCH_1_SIZE << " threads\n";
  for (int i = BATCH_0_SIZE; i < BATCH_0_SIZE + BATCH_1_SIZE; i += 2) {
    auto th = std::thread(worker1, batch, i);
    th.join();
    th = std::thread(worker2, batch, i + 1);
    th.join();
  }

  /* eviction may occur */
  batch = 2;
  std::cout << "[Batch 2] " << BATCH_2_SIZE << " threads\n";
  for (int i = BATCH_0_SIZE + BATCH_1_SIZE; i < THREAD_N; i += 2) {
    threads.emplace_back(std::thread(worker1, batch, i));
    threads.emplace_back(std::thread(worker2, batch, i + 1));
  }
  for (auto& th : threads) {
    th.join();
  }
  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
}

/**
 * Runs one find_or_insert() batch against `table` and verifies the values
 * stored in the table afterwards.
 *
 * Steps:
 *   1. export the table and assert the exported count equals table->size();
 *   2. find_or_insert() `len` keys and pass the returned value pointers to
 *      test_util::read_or_write_ptr together with `values`;
 *   3. export again, assert the count equals table->size(), and assert that
 *      for every exported key all `dim` value elements equal
 *      static_cast<float>(key * 0.00001).
 * A one-line summary is printed to std::cout.
 *
 * @tparam dim    Number of V elements per value vector.
 * @param table   Table under test.
 * @param keys    Device pointer to `len` keys.
 * @param values  Device pointer to `len * dim` value elements.
 * @param scores  Not used by this function; find_or_insert() is called with a
 *                null scores pointer.
 * @param len     Number of keys in the batch.
 * @param stream  CUDA stream used for all table calls and async operations.
 *
 * NOTE: a failing ASSERT_* returns immediately; the host staging buffers are
 * released by RAII on that path, the stream-ordered device buffers are not.
 */
template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckFindOrInsertValues(Table* table, K* keys, V* values, S* scores,
                             size_t len, cudaStream_t stream) {
  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;

  size_t table_size_before = table->size(stream);
  // Upper bound for the table size after the batch.
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));

  // Host staging buffers; owned by unique_ptr so an early ASSERT_* return
  // cannot leak them. `new T[n]` leaves the elements uninitialised, as the
  // buffers are fully overwritten by the copies below before being read.
  std::unique_ptr<K[]> h_tmp_keys(new K[cap]);
  std::unique_ptr<V[]> h_tmp_values(new V[cap * dim]);

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  CUDA_CHECK(cudaStreamSynchronize(stream));

  {
    V** d_vectors_ptr = nullptr;
    bool* d_found;
    CUDA_CHECK(cudaMalloc(&d_found, len * sizeof(bool)));
    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, len * sizeof(V*)));
    table->find_or_insert(len, keys, d_vectors_ptr, d_found, nullptr, stream);
    test_util::read_or_write_ptr(d_vectors_ptr, values, d_found, dim, len,
                                 stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaFree(d_vectors_ptr));
    CUDA_CHECK(cudaFree(d_found));
  }

  CUDA_CHECK(cudaStreamSynchronize(stream));

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys.get(), d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values.get(), d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Walk the export backwards so that, should a key appear more than once,
  // the entry with the lowest index is the one left in the map.
  for (int64_t i = static_cast<int64_t>(table_size_after) - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values.get() +
                                                         i * dim);
    map_after_insert[h_tmp_keys[i]] = *vec;
  }

  // Count keys with at least one unexpected value element.
  size_t value_diff_cnt = 0;
  for (auto& it : map_after_insert) {
    test_util::ValueArray<V, dim>& vec = it.second;
    for (size_t j = 0; j < dim; j++) {
      if (vec[j] != static_cast<float>(it.first * 0.00001)) {
        ++value_diff_cnt;
        break;
      }
    }
  }
  ASSERT_EQ(value_diff_cnt, 0);
  std::cout << "Check find_or_insert behavior got "
            << "value_diff_cnt: " << value_diff_cnt
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_find_or_insert_values_check(size_t max_hbm_for_vectors) {
  const size_t U = 524288;
  const size_t init_capacity = 1024;
  const size_t B = 524288 + 13;
  constexpr size_t dim = 64;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
  opt.dim = 64;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<K, V, S> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  S score = 0;
  for (int i = 0; i < 20; i++) {
    test_util::create_random_keys<K, S, V, dim>(
        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, B * 16);
    data_buffer.SyncData(true, stream);

    CheckFindOrInsertValues<K, V, S, Table, dim>(
        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),
        data_buffer.scores_ptr(), B, stream);

    offset += B;
    score += 1;
  }
}

void test_duplicated_keys(size_t max_hbm_for_vectors, size_t key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1024UL;
  constexpr uint64_t TEST_TIMES = 3;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_keys, 1, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_new_vectors;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      V** d_vectors_ptr = nullptr;
      bool* d_found;
      CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));

      table->find_or_insert(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores,
                            stream, false);
      test_util::read_or_write_ptr(d_vectors_ptr, d_vectors, d_found,
                                   options.dim, KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
      CUDA_CHECK(cudaFree(d_found));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 1);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_new_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// gtest registrations for the FindOrInsertPtrTest suite.
// Each case calls its helper twice with a different first argument (16 and 0;
// for the helpers visible in this file that argument is the HBM budget in GB
// handed to nv::merlin::GB()). Where a second argument is given it looks like
// the reserved-key start bit -- confirm against each helper's signature.
TEST(FindOrInsertPtrTest, test_export_batch_if) {
  test_export_batch_if(16);
  test_export_batch_if(0, 33);
}
// Arguments after the HBM budget are two float ratios; their meaning is
// defined by the helper (not visible here).
TEST(FindOrInsertPtrTest, test_find_or_insert_multi_threads) {
  test_find_or_insert_multi_threads(16, 0.25f, 0.125f);
  test_find_or_insert_multi_threads(16, 0.375f, 0.125f);
  test_find_or_insert_multi_threads(0, 0.25f, 0.125f);
  test_find_or_insert_multi_threads(0, 0.375f, 0.125f);
}
TEST(FindOrInsertPtrTest, test_basic) {
  test_basic(16, 3);
  test_basic(0);
}
TEST(FindOrInsertPtrTest, test_basic_when_full) {
  test_basic_when_full(16, 4);
  test_basic_when_full(0);
}
TEST(FindOrInsertPtrTest, test_erase_if_pred) {
  test_erase_if_pred(16);
  test_erase_if_pred(0, 18);
}
TEST(FindOrInsertPtrTest, test_rehash) {
  test_rehash(16);
  test_rehash(0, 44);
}
TEST(FindOrInsertPtrTest, test_rehash_on_big_batch) {
  test_rehash_on_big_batch(16, 23);
  test_rehash_on_big_batch(0);
}
TEST(FindOrInsertPtrTest, test_dynamic_rehash_on_multi_threads) {
  test_dynamic_rehash_on_multi_threads(16);
  test_dynamic_rehash_on_multi_threads(0, 19);
}
// Unlike the other cases, the helper here takes no HBM budget; the single
// optional argument is whatever its first parameter is (not visible here).
TEST(FindOrInsertPtrTest, test_basic_for_cpu_io) {
  test_basic_for_cpu_io();
  test_basic_for_cpu_io(52);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_lru_basic) {
  test_evict_strategy_lru_basic(16);
  test_evict_strategy_lru_basic(0, 18);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_lfu_basic) {
  test_evict_strategy_lfu_basic(16, 29);
  test_evict_strategy_lfu_basic(0);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_epochlru_basic) {
  test_evict_strategy_epochlru_basic(16, 45);
  test_evict_strategy_epochlru_basic(0);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_epochlfu_basic) {
  test_evict_strategy_epochlfu_basic(16);
  test_evict_strategy_epochlfu_basic(0, 59);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_customized_basic) {
  test_evict_strategy_customized_basic(16, 38);
  test_evict_strategy_customized_basic(0);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_customized_advanced) {
  test_evict_strategy_customized_advanced(16);
  test_evict_strategy_customized_advanced(0, 25);
}
TEST(FindOrInsertPtrTest, test_evict_strategy_customized_correct_rate) {
  // TODO(rhdong): remove the skip logic once the blossom CI issue is resolved.
  // The 0 GB variant is skipped whenever the IS_BLOSSOM_CI environment
  // variable is set (to any value).
  const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI"));
  test_evict_strategy_customized_correct_rate(16, 16);
  if (!skip_hmem_check) {
    test_evict_strategy_customized_correct_rate(0);
  } else {
    std::cout << "The HMEM check is skipped in blossom CI!" << std::endl;
  }
}
TEST(FindOrInsertPtrTest, test_find_or_insert_values_check) {
  test_find_or_insert_values_check(16);
  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.
  // NOTE(review): the 0 GB call below is already enabled, so the TODO above
  // looks stale -- confirm the hybrid-mode issue is fixed and drop the TODO.
  test_find_or_insert_values_check(0);
}
TEST(FindOrInsertPtrTest, test_duplicated_keys) {
  test_duplicated_keys(16, 39);
  test_duplicated_keys(0);
}


================================================
FILE: tests/find_or_insert_test.cc.cu
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * test APIs: find_or_insert and assign,
 * move insert operation from `insert_or_assign` to `find`.
 */

#include <gtest/gtest.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <thread>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

constexpr size_t DIM = 16;
using K = uint64_t;
using V = float;
using S = uint64_t;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

/*
 * Device-side predicate used with `erase_if`.
 *
 * Returns true when the low 7 bits of `key` are greater than `pattern` AND
 * `score` is greater than `threshold`.
 */
template <class K, class S>
struct EraseIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    // The mask must be parenthesized: `>` binds tighter than `&`, so the
    // unparenthesized form computed `key & (0x7f > pattern)`, i.e. it only
    // tested the lowest key bit whenever pattern < 0x7f.
    const bool key_matches = (key & 0x7f) > pattern;
    return key_matches && (score > threshold);
  }
};

/*
 * Device-side predicate that selects an entry purely by score: it is true
 * when `score` is strictly greater than `threshold`. `key` and `pattern` are
 * accepted but not consulted.
 */
template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    const bool above_threshold = (score > threshold);
    return above_threshold;
  }
};

/*
 * End-to-end smoke test of the basic table operations on a kCustomized table:
 * find_or_insert -> find -> assign -> find -> accum_or_assign -> erase ->
 * clear -> find_or_insert -> find -> export_batch, asserting the table size
 * and the returned scores/values after each stage.
 *
 * max_hbm_for_vectors: passed through nv::merlin::GB() into the options.
 * key_start: stored in options.reserved_key_start_bit.
 */
void test_basic(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  // Pinned host-side staging buffers.
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  // Note: cudaMemset is applied here to the pinned host allocation.
  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  // The assertions below expect score == key and every value element ==
  // key * 0.00001, so the generator presumably produces exactly that
  // (confirm in test_util).
  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  // Device-side buffers.
  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_new_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A fresh table per iteration; it must start empty.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Stage 1: insert all keys; the table is far larger than KEY_NUM, so
    // every key is expected to land.
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Wipe the output buffers so the following find must repopulate them.
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    // Stage 2: look the keys up through the pointer-returning find overload
    // and gather the pointed-to rows into d_vectors.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    int found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    // Every key must be found with score == key and values == key * 0.00001.
    // (This `i` shadows the outer iteration counter.)
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    // Stage 3: overwrite all values via assign. The byte-wise memset with 2
    // gives every float the bit pattern 0x02020202, which is what the check
    // after the next find compares against.
    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));
    table->assign(KEY_NUM, reinterpret_cast<const K*>(d_keys),
                  reinterpret_cast<const float*>(d_new_vectors),
                  reinterpret_cast<const S*>(d_scores), stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // assign must not change the number of entries.
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    // Stage 4: read the assigned values back. Unlike stage 2, this find is
    // given the flat value buffer (d_new_vectors), not d_vectors_ptr.
    // NOTE(review): d_vectors_ptr is built from d_new_vectors and then read
    // back into d_new_vectors, which looks like a row-onto-itself copy --
    // confirm against test_util::array2ptr / read_from_ptr.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      test_util::array2ptr(d_vectors_ptr, d_new_vectors, options.dim, KEY_NUM,
                           stream);
      table->find(KEY_NUM, d_keys, d_new_vectors, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_new_vectors, options.dim,
                               KEY_NUM, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    // Bit pattern produced by the byte-wise memset(…, 2, …) above.
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      for (int j = 0; j < options.dim; j++) {
        // Compare against the float whose raw bits are i_value.
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  *(reinterpret_cast<float*>(&i_value)));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    // Stage 5: accum_or_assign, with d_found (all true at this point per the
    // assertion above) as the per-key flag array; the entry count must not
    // change.
    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,
                           stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Stage 6: erase the first half of the keys; size must halve.
    table->erase(KEY_NUM >> 1, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, total_size >> 1);

    // Stage 7: clear must leave the table empty.
    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Stage 8: re-insert after clear. d_vectors/d_scores still hold what the
    // stage-2 find wrote (asserted equal to the generated data above).
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    // Stage 9: same pointer-based lookup as stage 2, after the re-insert.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    // Stage 10: export the whole table. The key/score/value buffers are
    // zeroed first and then overwritten by the export, so h_keys is replaced
    // by the exported keys below.
    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ASSERT_EQ(dump_counter, KEY_NUM);
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    // Exported rows must be self-consistent: score == key, value ==
    // key * 0.00001, regardless of the order in which they were exported.
    for (int i = 0; i < KEY_NUM; i++) {
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_new_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/*
 * Inserts as many keys as the table has capacity for, then checks that:
 *   - the number of keys `find` reports equals the table size after insert,
 *     and every found key carries the expected values;
 *   - erasing all keys empties the table;
 *   - re-inserting the same batch yields the same size as the first insert.
 *
 * max_hbm_for_vectors: passed through nv::merlin::GB() into the options.
 * key_start: stored in options.reserved_key_start_bit.
 */
void test_basic_when_full(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = INIT_CAPACITY;
  constexpr uint64_t TEST_TIMES = 1;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // Byte sizes shared by all allocations, fills and copies below.
  const size_t keys_bytes = KEY_NUM * sizeof(K);
  const size_t scores_bytes = KEY_NUM * sizeof(S);
  const size_t vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
  const size_t found_bytes = KEY_NUM * sizeof(bool);

  // Pinned host buffers.
  K* h_keys = nullptr;
  S* h_scores = nullptr;
  V* h_vectors = nullptr;
  bool* h_found = nullptr;
  CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&h_scores, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));
  CUDA_CHECK(cudaMallocHost(&h_found, found_bytes));

  CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  // Device buffers; d_def_val receives the values returned by find.
  K* d_keys = nullptr;
  S* d_scores = nullptr;
  V* d_vectors = nullptr;
  V* d_def_val = nullptr;
  bool* d_found = nullptr;
  CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_def_val, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_found, found_bytes));

  // Upload the generated batch.
  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_scores, h_scores, scores_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_vectors, h_vectors, vectors_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_def_val, 2, vectors_bytes));
  CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int round = 0; round < TEST_TIMES; round++) {
    // Fresh table for every round; it has to start out empty.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // First insert: the batch is exactly as large as the table capacity.
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_insert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Look everything up again (no scores requested).
    CUDA_CHECK(cudaMemset(d_def_val, 0, vectors_bytes));
    table->find(KEY_NUM, d_keys, d_def_val, d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, found_bytes));
    CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));
    CUDA_CHECK(
        cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(
        cudaMemcpy(h_vectors, d_def_val, vectors_bytes, cudaMemcpyDeviceToHost));

    // Only keys reported as found are value-checked; the rest are skipped.
    for (int i = 0; i < KEY_NUM; i++) {
      if (!h_found[i]) continue;
      found_num++;
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(total_size_after_insert, found_num);

    // Erasing the whole batch must leave nothing behind.
    table->erase(KEY_NUM, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, 0);

    // Second insert of the same batch must reach the same size as the first.
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_reinsert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  // Final device -> host copy of the value buffer (result is not inspected).
  CUDA_CHECK(
      cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_erase_if_pred(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;
  constexpr uint64_t BUCKET_MAX_SIZE = 128;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

    K pattern = 100;
    S threshold = 0;
    size_t erase_num = table->template erase_if<EraseIfPredFunctor>(
        pattern, threshold, stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ((erase_num + total_size), BUCKET_MAX_SIZE);

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, d_scores, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        ASSERT_EQ(h_scores[i], h_keys[i]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, (BUCKET_MAX_SIZE - erase_num));

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/*
 * Inserts 2 * BUCKET_MAX_SIZE keys into a table that starts with a single
 * bucket's capacity, exports them, grows the table to MAX_CAPACITY with
 * `reserve`, and checks that the size is unchanged and that the first
 * BUCKET_MAX_SIZE exported keys are still found with matching scores/values.
 * Repeated TEST_TIMES with a fresh table each time.
 *
 * max_hbm_for_vectors: passed through nv::merlin::GB() into the options.
 * key_start: stored in options.reserved_key_start_bit.
 */
void test_rehash(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;
  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;
  constexpr uint64_t TEST_TIMES = 100;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // Byte sizes shared by all allocations, fills and copies below.
  const size_t keys_bytes = KEY_NUM * sizeof(K);
  const size_t scores_bytes = KEY_NUM * sizeof(S);
  const size_t vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
  const size_t found_bytes = KEY_NUM * sizeof(bool);

  // Pinned host buffers.
  K* h_keys = nullptr;
  S* h_scores = nullptr;
  V* h_vectors = nullptr;
  bool* h_found = nullptr;
  CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&h_scores, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));
  CUDA_CHECK(cudaMallocHost(&h_found, found_bytes));

  // Device buffers.
  K* d_keys = nullptr;
  S* d_scores = nullptr;
  V* d_vectors = nullptr;
  bool* d_found = nullptr;
  size_t dump_counter = 0;
  CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_found, found_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    // Generate and upload this round's batch.
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(
        cudaMemcpy(d_scores, h_scores, scores_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, vectors_bytes,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Insert twice the initial capacity; all keys are expected to be stored.
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaDeviceSynchronize());
    ASSERT_EQ(total_size, KEY_NUM);

    // Export overwrites the device key/value/score buffers with table
    // contents; those exported keys are what gets looked up afterwards.
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(dump_counter, KEY_NUM);

    // Grow explicitly to the maximum capacity; no entry may be lost.
    table->reserve(MAX_CAPACITY, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table->capacity(), MAX_CAPACITY);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_vectors, 0, vectors_bytes));
    // Only the first BUCKET_MAX_SIZE exported keys are looked up.
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, BUCKET_MAX_SIZE * sizeof(V*)));
      table->find(BUCKET_MAX_SIZE, d_keys, d_vectors_ptr, d_found, d_scores,
                  stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim,
                               BUCKET_MAX_SIZE, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    // Zero the host buffers, then pull keys and lookup results back.
    CUDA_CHECK(cudaMemset(h_found, 0, found_bytes));
    CUDA_CHECK(cudaMemset(h_scores, 0, scores_bytes));
    CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, keys_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(
        cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(
        cudaMemcpy(h_scores, d_scores, scores_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(
        cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));

    // Each looked-up key must be present with score == key and values ==
    // key * 0.00001.
    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {
      if (!h_found[i]) continue;
      found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, BUCKET_MAX_SIZE);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  // Final device -> host copy of the value buffer (result is not inspected).
  CUDA_CHECK(
      cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks table growth when the insert batch is as large as, or larger than,
// the current capacity. The test inserts the first INIT_KEY_NUM keys, then the
// whole KEY_NUM batch, asserting the reported size and capacity after each
// step. It then exports the table, looks the exported keys up again and
// validates the returned scores and vector contents, and finally clears the
// table.
//
// max_hbm_for_vectors: passed through nv::merlin::GB() into
//                      TableOptions::max_hbm_for_vectors.
// key_start:           stored in TableOptions::reserved_key_start_bit.
void test_rehash_on_big_batch(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 1024;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024;
  constexpr uint64_t INIT_KEY_NUM = 1024;
  constexpr uint64_t KEY_NUM = 2048;
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = 128;
  options.max_load_factor = 0.6;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // Byte sizes shared by the matching host and device buffers.
  const size_t keys_bytes = KEY_NUM * sizeof(K);
  const size_t scores_bytes = KEY_NUM * sizeof(S);
  const size_t vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
  const size_t found_bytes = KEY_NUM * sizeof(bool);

  // Host-side staging buffers.
  K* keys_host;
  S* scores_host;
  V* vectors_host;
  bool* found_host;
  CUDA_CHECK(cudaMallocHost(&keys_host, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&scores_host, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&vectors_host, vectors_bytes));
  CUDA_CHECK(cudaMallocHost(&found_host, found_bytes));

  // Device-side buffers handed to the table API.
  K* keys_dev;
  S* scores_dev = nullptr;
  V* vectors_dev;
  bool* found_dev;
  CUDA_CHECK(cudaMalloc(&keys_dev, keys_bytes));
  CUDA_CHECK(cudaMalloc(&scores_dev, scores_bytes));
  CUDA_CHECK(cudaMalloc(&vectors_dev, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&found_dev, found_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  auto table = std::make_unique<Table>();
  table->init(options);

  test_util::create_random_keys<K, S, V, DIM>(keys_host, scores_host,
                                              vectors_host, KEY_NUM);

  CUDA_CHECK(
      cudaMemcpy(keys_dev, keys_host, keys_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(scores_dev, scores_host, scores_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(vectors_dev, vectors_host, vectors_bytes,
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(found_dev, 0, found_bytes));

  // A freshly initialised table must report zero entries.
  uint64_t total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);

  // Step 1: insert only the first INIT_KEY_NUM keys of the batch.
  table->find_or_insert(INIT_KEY_NUM, keys_dev, vectors_dev, scores_dev,
                        stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  uint64_t expected_size = INIT_KEY_NUM;

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));

  // Step 2: submit the whole batch, which includes the keys from step 1.
  table->find_or_insert(KEY_NUM, keys_dev, vectors_dev, scores_dev, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  expected_size = KEY_NUM;

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), KEY_NUM * 4);

  // Export everything back into the device buffers.
  size_t dump_counter = table->export_batch(table->capacity(), 0, keys_dev,
                                            vectors_dev, scores_dev, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(dump_counter, expected_size);

  // Wipe vectors and scores so the lookup below has to repopulate them.
  CUDA_CHECK(cudaMemset(vectors_dev, 0, vectors_bytes));
  CUDA_CHECK(cudaMemset(scores_dev, 0, scores_bytes));
  {
    V** value_ptrs_dev = nullptr;
    CUDA_CHECK(cudaMalloc(&value_ptrs_dev, KEY_NUM * sizeof(V*)));
    table->find(KEY_NUM, keys_dev, value_ptrs_dev, found_dev, scores_dev,
                stream);
    test_util::read_from_ptr(value_ptrs_dev, vectors_dev, options.dim, KEY_NUM,
                             stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaFree(value_ptrs_dev));
  }
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Bring the lookup results back to the host.
  CUDA_CHECK(cudaMemset(found_host, 0, found_bytes));
  CUDA_CHECK(cudaMemset(scores_host, 0, scores_bytes));
  CUDA_CHECK(cudaMemset(vectors_host, 0, vectors_bytes));
  CUDA_CHECK(
      cudaMemcpy(keys_host, keys_dev, keys_bytes, cudaMemcpyDeviceToHost));
  CUDA_CHECK(
      cudaMemcpy(found_host, found_dev, found_bytes, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(scores_host, scores_dev, scores_bytes,
                        cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(vectors_host, vectors_dev, vectors_bytes,
                        cudaMemcpyDeviceToHost));

  // Every key must be found, carry score == key, and hold key * 0.00001 in
  // each vector element.
  int found_num = 0;
  for (int i = 0; i < KEY_NUM; i++) {
    if (!found_host[i]) continue;
    found_num++;
    ASSERT_EQ(scores_host[i], keys_host[i]);
    const V* row = vectors_host + i * options.dim;
    for (int j = 0; j < options.dim; j++) {
      ASSERT_EQ(row[j], static_cast<float>(keys_host[i] * 0.00001));
    }
  }
  ASSERT_EQ(found_num, KEY_NUM);

  // clear() must leave the table empty.
  table->clear(stream);
  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(vectors_host, vectors_dev, vectors_bytes,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(keys_host));
  CUDA_CHECK(cudaFreeHost(scores_host));
  CUDA_CHECK(cudaFreeHost(vectors_host));
  CUDA_CHECK(cudaFreeHost(found_host));

  CUDA_CHECK(cudaFree(keys_dev));
  CUDA_CHECK(cudaFree(scores_dev));
  CUDA_CHECK(cudaFree(vectors_dev));
  CUDA_CHECK(cudaFree(found_dev));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Stress test for capacity growth under concurrent use. THREAD_N host threads
// share one LRU table. Each worker repeatedly inserts a fresh random batch of
// KEY_NUM keys with find_or_insert(), looks the same keys up again with
// find(), and checks that every key is found with the expected vector. The
// workers loop until table->capacity() reaches MAX_CAPACITY, which is asserted
// once more after all threads have joined.
//
// max_hbm_for_vectors: passed through nv::merlin::GB() into
//                      TableOptions::max_hbm_for_vectors.
// key_start:           stored in TableOptions::reserved_key_start_bit.
void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = 4 * 1024;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 256;
  constexpr uint64_t THREAD_N = 8;

  std::vector<std::thread> threads;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);

  // Per-thread body. Each worker owns its own host/device buffers and stream;
  // only the table is shared. task_n identifies the worker, and worker 0 is
  // the only one that logs capacity changes.
  auto worker_function = [&table, KEY_NUM, options](int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    bool* d_found;
    // Scratch array of per-key value pointers filled by find(). It is
    // allocated once per worker rather than once per loop iteration, so the
    // hot loop no longer pays for a cudaMalloc/cudaFree pair on every pass.
    V** d_vectors_ptr = nullptr;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    while (table->capacity() < MAX_CAPACITY) {
      // New random batch for every pass; scores are not supplied (nullptr).
      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                  KEY_NUM);
      CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

      table->find_or_insert(KEY_NUM, d_keys, d_vectors, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Zero the vectors so the values checked below must come from find().
      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      int found_num = 0;

      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));

      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));
      // Every key of this batch must be found, with each vector element equal
      // to key * 0.00001.
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
          for (int j = 0; j < options.dim; j++) {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
      ASSERT_EQ(found_num, KEY_NUM);
      if (task_n == 0 && current_capacity != table->capacity()) {
        std::cout << "[test_dynamic_rehash_on_multi_threads] The capacity "
                     "changed from "
                  << current_capacity << " to " << table->capacity()
                  << std::endl;
        current_capacity = table->capacity();
      }
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }
    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaFree(d_vectors_ptr));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  for (int i = 0; i < THREAD_N; ++i)
    threads.emplace_back(std::thread(worker_function, i));

  for (auto& th : threads) {
    th.join();
  }
  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
}

// Exercises HashTable::export_batch_if with ExportIfPredFunctor on an LRU
// table. KEY_NUM random keys are inserted without explicit scores and looked
// up again to validate their vectors. The table is then exported twice: once
// with a threshold captured before the insert, and once with a threshold
// captured after it, for which the exported count is asserted to be zero.
//
// max_hbm_for_vectors: passed through nv::merlin::GB() into
//                      TableOptions::max_hbm_for_vectors.
// key_start:           stored in TableOptions::reserved_key_start_bit.
void test_export_batch_if(size_t max_hbm_for_vectors, int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  size_t h_dump_counter = 0;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  size_t* d_dump_counter;
  int found_num = 0;
  bool* h_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));

  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;

  // Timestamp taken before any key is inserted; used as the score threshold
  // for the first export.
  S threshold = test_util::host_nano<S>();
  for (int i = 0; i < TEST_TIMES; i++) {
    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                                KEY_NUM);

    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Scores are not supplied (nullptr) for this LRU table.
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));

    // Zero the vectors so the values checked below must come from find().
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    // Iterate over exactly the KEY_NUM entries that h_found, h_keys and
    // h_vectors were allocated for.
    for (size_t k = 0; k < KEY_NUM; k++) {
      if (h_found[k]) {
        found_num++;
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[k * options.dim + j],
                    static_cast<float>(h_keys[k] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    K pattern = 100;

    // First export: threshold predates the insert.
    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,
        d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));

    // The exported count must match the number of score slots above the
    // threshold in the output buffer.
    size_t expected_export_count = 0;
    for (size_t k = 0; k < KEY_NUM; k++) {
      if (h_scores[k] > threshold) expected_export_count++;
    }
    ASSERT_EQ(expected_export_count, h_dump_counter);

    // Second export: a fresh timestamp as threshold; nothing may be exported.
    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, test_util::host_nano<S>(), table->capacity(), 0,
        d_dump_counter, d_keys, d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));
    ASSERT_EQ(0, h_dump_counter);

    CUDA_CHECK(cudaMemset(h_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    // NOTE(review): h_dump_counter was just asserted to be 0, so this loop
    // body never executes. If the intent is to validate the records produced
    // by the first export, the count from that export has to be kept and used
    // here instead - TODO confirm.
    for (size_t k = 0; k < h_dump_counter; k++) {
      ASSERT_GT(h_scores[k], threshold);
      for (size_t j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[k * options.dim + j],
                  static_cast<float>(h_keys[k] * 0.00001));
      }
    }

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_dump_counter));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Runs the basic table API sequence with TableOptions::io_by_cpu enabled and
// max_hbm_for_vectors set to GB(0): find_or_insert, assign, find,
// accum_or_assign, erase, clear, a second find_or_insert and export_batch.
// The reported size (or exported count) is asserted after each step.
//
// key_start: stored in TableOptions::reserved_key_start_bit.
void test_basic_for_cpu_io(int key_start = 0) {
  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(0);
  options.io_by_cpu = true;
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  // Only keys and scores are generated; the vectors pointer is nullptr.
  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  // Per-key value pointers filled by find(). Allocated once here and reused
  // by every loop iteration.
  V** d_vectors_ptr;
  bool* d_found;
  size_t dump_counter = 0;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));

  // cudaMemset fills bytes, so every byte of d_vectors becomes 0x01.
  CUDA_CHECK(cudaMemset(d_vectors, 1, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, KEY_NUM * sizeof(V*)));
  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Overwrite the stored values through assign(); the size must not change.
    CUDA_CHECK(cudaMemset(d_vectors, 2, KEY_NUM * sizeof(V) * options.dim));
    table->assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Look every key up again; only the found flags are verified below.
    table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
    test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                             stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    int found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
    }
    ASSERT_EQ(found_num, KEY_NUM);

    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,
                           stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Erase the first half of the keys; the size must halve.
    table->erase(KEY_NUM >> 1, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, total_size >> 1);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Re-insert everything and export it in one batch.
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);

    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(dump_counter, KEY_NUM);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks score bookkeeping for an LRU table (EvictStrategy::kLru). Each of
// the TEST_TIMES iterations uses a fresh table and runs two phases:
//   1. Insert BASE_KEY_NUM keys generated by create_keys_in_one_buckets,
//      export the table, and assert that every exported score lies within
//      [start_ts, end_ts] measured around the insert.
//   2. Submit TEST_KEY_NUM keys (two generated ones plus copies of base keys
//      72 and 73) through assign() and find_or_insert(), export again, and
//      assert that scores of the submitted keys lie within the new
//      [start_ts, end_ts] while all other scores are <= start_ts.
// In both phases every exported vector element must equal key * 0.00001.
//
// max_hbm_for_vectors: passed through nv::merlin::GB() into
//                      TableOptions::max_hbm_for_vectors.
// key_start:           stored in TableOptions::reserved_key_start_bit.
void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make the last two test keys duplicates of existing base keys (with the
  // matching vectors), so phase 2 mixes new keys with already-present ones.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts = test_util::host_nano<S>(stream);
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      // Smallest score bounds the range from below and the largest from
      // above. BASE_KEY_NUM scores were exported, so the largest one sits at
      // index BASE_KEY_NUM - 1 of the sorted copy.
      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);
      for (size_t k = 0; k < dump_counter; k++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[k * options.dim + j],
                    static_cast<float>(h_keys_temp[k] * 0.00001));
        }
      }
    }

    // Phase 2: submit the test keys to the already populated table.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts = test_util::host_nano<S>(stream);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      // The size is asserted to still equal BUCKET_MAX_SIZE afterwards.
      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Collect the scores of exported keys that belong to the test set;
      // every other key must carry a score no later than start_ts.
      std::array<S, TEMP_KEY_NUM> h_scores_temp_sorted;
      int ctr = 0;
      for (size_t k = 0; k < TEMP_KEY_NUM; k++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[k])) {
          ASSERT_GT(h_scores_temp[k], BUCKET_MAX_SIZE);
          h_scores_temp_sorted[ctr++] = h_scores_temp[k];
        } else {
          ASSERT_LE(h_scores_temp[k], start_ts);
        }
      }
      std::sort(h_scores_temp_sorted.begin(),
                h_scores_temp_sorted.begin() + ctr);

      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[ctr - 1], end_ts);
      for (size_t k = 0; k < dump_counter; k++) {
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[k * options.dim + j],
                    static_cast<float>(h_keys_temp[k] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Basic eviction test for EvictStrategy::kLfu.
//
// Every one of the TEST_TIMES rounds builds a fresh table and runs two phases:
//   1. find_or_insert BASE_KEY_NUM keys together with their scores and check
//      that every exported score equals key % freq_range;
//   2. assign + find_or_insert TEST_KEY_NUM keys, two of which duplicate base
//      keys, and check that the table still reports BUCKET_MAX_SIZE entries,
//      that the duplicated keys carry three times key % freq_range and that
//      every other exported key carries exactly key % freq_range.
// In both phases each exported vector element must equal key * 0.00001.
//
// max_hbm_for_vectors: passed through nv::merlin::GB() into
//                      options.max_hbm_for_vectors.
// key_start:           stored in options.reserved_key_start_bit.
void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors,
                                   int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;

  // Host-side inputs for phase one (base) and phase two (test), plus the
  // staging buffers that receive whatever export_batch() returns.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device scratch buffers, reused both as table inputs and export targets.
  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  // Base and test keys are drawn from two disjoint key intervals.
  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Make the last two test keys duplicates of base keys 72 and 73 (same key,
  // same per-call score, same vector) so phase two touches existing entries.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase one: load the base keys and verify what comes back.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_scores_temp[i], h_keys_temp[i] % freq_range);
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase two: submit the test keys through assign() and find_or_insert().
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // The number of stored entries must not change.
      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        const bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);
        const bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);
        if (in_base && in_test) {
          // Inserted once in phase one and touched twice in phase two.
          // NOTE(review): the factor 3 presumably reflects each call adding
          // the supplied score to the stored one - confirm against the kLfu
          // score update.
          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range) * 3);
        } else {
          ASSERT_EQ(h_scores_temp[i], (h_keys_temp[i] % freq_range));
        }
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Basic eviction test for EvictStrategy::kEpochLru.
//
// Every one of the TEST_TIMES rounds builds a fresh table and runs two phases,
// each under its own global epoch:
//   1. find_or_insert BASE_KEY_NUM keys without scores and check that every
//      exported score lies between (global_epoch << 32 | start_ts) and
//      (global_epoch << 32 | end_ts), where the timestamps are taken on the
//      host right before and right after the insertion;
//   2. bump the epoch, assign + find_or_insert TEST_KEY_NUM keys (two of them
//      duplicate base keys) and check that exported test keys carry a score
//      in the new epoch's [start_ts, end_ts] window while every other exported
//      key carries a score not above (global_epoch << 32 | start_ts).
// In both phases each exported vector element must equal key * 0.00001.
//
// max_hbm_for_vectors: passed through nv::merlin::GB() into
//                      options.max_hbm_for_vectors.
// key_start:           stored in options.reserved_key_start_bit.
void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  // Host timestamps are shifted right by this many bits and masked to 32 bits
  // before being OR-ed below the epoch in the expected score.
  constexpr int RSHIFT_ON_NANO = 20;

  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;

  // Host-side inputs for phase one (base) and phase two (test), plus the
  // staging buffers that receive whatever export_batch() returns.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device scratch buffers, reused both as table inputs and export targets.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  // Base and test keys are drawn from two disjoint key intervals.
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make the last two test keys duplicates of base keys 72 and 73 (same key,
  // same vector) so phase two touches entries that already exist.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  // Not reset per round: it keeps growing by one in every round's phase two.
  S global_epoch = 1;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase one: load the base keys under the current epoch.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      // NOTE(review): the scores are uploaded, but nullptr is passed as the
      // scores argument of find_or_insert() below, so this copy looks unused.
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      // start_ts / end_ts bracket the insertion; their order relative to the
      // table call and the stream synchronization must not be changed.
      // NOTE(review): host_nano is presumably a host-side nanosecond clock -
      // confirm in test_util.
      S start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
      table->set_global_epoch(global_epoch);
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;

      // This declaration shadows the outer total_size for the rest of the
      // block.
      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Only the smallest and the largest exported score are range-checked.
      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      // Expected score layout: epoch in the high 32 bits, shifted host
      // timestamp in the low 32 bits.
      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],
                (global_epoch << 32 | end_ts));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase two: advance the epoch and submit the test keys through assign()
    // and find_or_insert().
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      // NOTE(review): as in phase one, the uploaded scores look unused because
      // nullptr is passed as the scores argument below.
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
      table->set_global_epoch(global_epoch);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr, stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, nullptr,
                            stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;

      // The number of stored entries must not change.
      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Collect the scores of exported keys that belong to the test set; all
      // other exported keys must not exceed the new epoch's lower bound.
      std::vector<S> h_scores_temp_sorted;
      h_scores_temp_sorted.reserve(TEMP_KEY_NUM);
      for (int i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {
          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));
          h_scores_temp_sorted.push_back(h_scores_temp[i]);
        } else {
          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));
        }
      }
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      // Guarded so that an export without any test key does not index an
      // empty vector.
      if (!h_scores_temp_sorted.empty()) {
        ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));
        ASSERT_LE(h_scores_temp_sorted[h_scores_temp_sorted.size() - 1],
                  (global_epoch << 32 | end_ts));
      }
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors,
                                        int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Simulate overflow of low 32bits.
  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -
                                     static_cast<uint32_t>(1));

  h_keys_test[1] = h_keys_base[71];
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[1] = h_scores_base[71];
  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[1 * options.dim + i] = h_vectors_base[71 * options.dim + i];
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  S global_epoch = 1;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < dump_counter; i++) {
        if (h_keys_temp[i] == h_keys_base[71]) {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, h_scores_base[71]);
          ASSERT_EQ(h_scores_temp[i], expected_score);
        } else {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, (h_keys_temp[i] % freq_range));
          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),
                                                 h_keys_temp.end(),
                                                 h_keys_base[71]));

      for (int i = 0; i < dump_counter; i++) {
        bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);
        bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);

        if (in_base && in_test) {
          if (h_keys_temp[i] == h_keys_base[71]) {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch, h_scores_base[71] * 2);
            ASSERT_EQ(h_scores_temp[i], expected_score);
          } else {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch,
                (h_keys_temp[i] % freq_range) * 3);  // update score when found.
            ASSERT_EQ(h_scores_temp[i], expected_score);
          }
        } else {
          if (h_keys_temp[i] == h_keys_base[71]) {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch - static_cast<S>(in_base), h_scores_base[71]);
            ASSERT_EQ(h_scores_temp[i], expected_score);
          } else {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch - static_cast<S>(in_base),
                (h_keys_temp[i] % freq_range));

            ASSERT_EQ(h_scores_temp[i], expected_score);
          }
        }
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors,
                                          int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 128;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  const S base_score_start = 1000;
  for (int i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);
  const S test_score_start = base_score_start + BASE_KEY_NUM;
  for (int i = 0; i < TEST_KEY_NUM; i++) {
    h_scores_test[i] = test_score_start + i;
  }
  for (int i = 64; i < TEST_KEY_NUM; i++) {
    h_keys_test[i] = h_keys_base[i];
    for (int j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                    stream);
      table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range_test =
          test_util::range<S, TEST_KEY_NUM>(test_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range_test.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Repeated eviction check for EvictStrategy::kCustomized.
//
// Every one of the TEST_TIMES iterations builds a fresh table and then:
//   1. inserts BASE_KEY_NUM (== BUCKET_MAX_SIZE) keys scored 1000..1127 and
//      verifies that all of them can be exported again with intact values;
//   2. runs assign() followed by find_or_insert() on eight more keys: four
//      unseen keys whose scores are lower than every stored score (asserted
//      to be absent afterwards) and four already stored keys (asserted to be
//      present and to carry their new scores).
//
// max_hbm_for_vectors is converted with nv::merlin::GB(); key_start is
// written to options.reserved_key_start_bit.
void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors,
                                             int key_start = 0) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 8;
  constexpr uint64_t TEMP_KEY_NUM =
      (TEST_KEY_NUM < BASE_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 256;
  // Test keys [0, NEW_KEY_NUM) are unseen keys; the remaining ones alias the
  // base keys starting at index ALIASED_BASE_START.
  constexpr uint64_t NEW_KEY_NUM = 4;
  constexpr uint64_t ALIASED_BASE_START = 72;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // Host-side staging: the base batch, the test batch and a read-back area.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // One set of device buffers is shared by uploads and exports.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;
  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  // Blocking host-to-device copy of `n` keys/scores/vectors.
  const auto upload = [&](const K* keys, const S* scores, const V* vectors,
                          size_t n) {
    CUDA_CHECK(
        cudaMemcpy(d_keys_temp, keys, n * sizeof(K), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores_temp, scores, n * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors_temp, vectors,
                          n * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
  };

  // Blocking copy of `n` exported entries into the h_*_temp vectors.
  const auto download = [&](size_t n) {
    CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp, n * sizeof(K),
                          cudaMemcpyDefault));
    CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp, n * sizeof(S),
                          cudaMemcpyDefault));
    CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                          n * sizeof(V) * options.dim, cudaMemcpyDefault));
  };

  // Base batch: generated keys, scores overwritten with 1000, 1001, ...
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  const S base_score_start = 1000;
  for (uint64_t idx = 0; idx < BASE_KEY_NUM; ++idx) {
    h_scores_base[idx] = base_score_start + idx;
  }

  // Test batch: generated from a key range that starts where the base range
  // ends.
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // The last four test keys are replaced by keys that are already stored.
  for (uint64_t k = NEW_KEY_NUM; k < TEST_KEY_NUM; ++k) {
    h_keys_test[k] = h_keys_base[ALIASED_BASE_START + (k - NEW_KEY_NUM)];
  }

  // Four new keys get scores lower than anything stored, so they would not be
  // inserted; four existing keys get new scores, which only refreshes them.
  const S test_scores[TEST_KEY_NUM] = {20, 78, 97, 98, 99, 1010, 1020, 1035};
  for (uint64_t k = 0; k < TEST_KEY_NUM; ++k) {
    h_scores_test[k] = test_scores[k];
  }

  // Values of the aliased keys follow the "key * 0.00001" pattern that the
  // export checks below compare against.
  for (uint64_t k = NEW_KEY_NUM; k < TEST_KEY_NUM; ++k) {
    const V fill = static_cast<V>(h_keys_test[k] * 0.00001);
    for (size_t d = 0; d < options.dim; ++d) {
      h_vectors_test[k * options.dim + d] = fill;
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  for (uint64_t round = 0; round < TEST_TIMES; ++round) {
    auto table = std::make_unique<Table>();
    table->init(options);

    const size_t size_when_empty = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(size_when_empty, 0);

    // ---- Phase 1: insert the base batch and read it back. ----
    upload(h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
           BASE_KEY_NUM);
    table->find_or_insert(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                          d_scores_temp, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    const size_t size_after_base = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(size_after_base, BUCKET_MAX_SIZE);

    size_t exported =
        table->export_batch(table->capacity(), 0, d_keys_temp, d_vectors_temp,
                            d_scores_temp, stream);
    ASSERT_EQ(exported, BUCKET_MAX_SIZE);
    download(BASE_KEY_NUM);

    {
      // Export order is not relied upon: compare the sorted scores with the
      // expected consecutive range.
      std::vector<S> sorted_scores(h_scores_temp);
      std::sort(sorted_scores.begin(), sorted_scores.end());
      const auto expected_scores =
          test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(sorted_scores.begin(), sorted_scores.end(),
                             expected_scores.begin()));
    }
    for (size_t row = 0; row < exported; ++row) {
      const float expected_value =
          static_cast<float>(h_keys_temp[row] * 0.00001);
      for (size_t d = 0; d < options.dim; ++d) {
        ASSERT_EQ(h_vectors_temp[row * options.dim + d], expected_value);
      }
    }

    // ---- Phase 2: assign + find_or_insert with the mixed test batch. ----
    upload(h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
           TEST_KEY_NUM);
    table->assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp, d_scores_temp,
                  stream);
    table->find_or_insert(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                          d_scores_temp, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    const size_t size_after_test = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(size_after_test, BUCKET_MAX_SIZE);

    exported = table->export_batch(table->capacity(), 0, d_keys_temp,
                                   d_vectors_temp, d_scores_temp, stream);
    ASSERT_EQ(exported, BUCKET_MAX_SIZE);
    download(TEMP_KEY_NUM);

    // Low-score newcomers must be missing, refreshed keys must still exist.
    for (uint64_t k = 0; k < TEST_KEY_NUM; ++k) {
      const bool present =
          std::find(h_keys_temp.begin(), h_keys_temp.end(), h_keys_test[k]) !=
          h_keys_temp.end();
      if (k < NEW_KEY_NUM) {
        ASSERT_FALSE(present);
      } else {
        ASSERT_TRUE(present);
      }
    }

    // Refreshed keys carry their new scores; every exported value still
    // matches its key.
    for (uint64_t row = 0; row < TEMP_KEY_NUM; ++row) {
      for (uint64_t k = NEW_KEY_NUM; k < TEST_KEY_NUM; ++k) {
        if (h_keys_temp[row] == h_keys_test[k]) {
          ASSERT_EQ(h_scores_temp[row], h_scores_test[k]);
        }
      }

      const float expected_value =
          static_cast<float>(h_keys_temp[row] * 0.00001);
      for (size_t d = 0; d < options.dim; ++d) {
        ASSERT_EQ(h_vectors_temp[row * options.dim + d], expected_value);
      }
    }
  }

  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Verifies one epoch of assign() followed by insert_and_evict() on `table`.
//
// Steps:
//   1. Snapshot the table (export_batch) and append the incoming batch, so
//      the "before" maps know every key/value that may exist afterwards.
//   2. set_global_epoch(global_epoch) + assign(): the table size must not
//      change, and every batch key that was already stored must end up with
//      a score whose upper 32 bits equal global_epoch and whose lower 32 bits
//      equal (old low bits + batch score low bits).
//   3. insert_and_evict(): every key known before must be found either in
//      the table or in the evicted output with an unchanged value; evicted
//      scores and the scores of the batch keys are validated as well.
//
// Parameters:
//   table            table under test.
//   data_buffer      batch to assign/insert; keys_ptr()/values_ptr()/
//                    scores_ptr() are handed to the table and used as
//                    device-to-host copy sources, the *_ptr(false) variants
//                    are dereferenced on the host.
//   evict_buffer     receives the output of insert_and_evict().
//   pre_data_buffer  unused; kept for interface compatibility.
//   len              number of entries in data_buffer.
//   stream           stream used for all asynchronous work.
//   opt              unused; kept for interface compatibility.
//   global_epoch     epoch installed on the table before assign().
//
// Failures are reported through GoogleTest ASSERT_* macros, which return from
// this function early. Host scratch memory is owned by std::vector and is
// released on that path; the stream-ordered device scratch buffers are only
// released when the function runs to the end.
template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckAssignOnEpochLfu(Table* table,
                           test_util::KVMSBuffer<K, V, S>* data_buffer,
                           test_util::KVMSBuffer<K, V, S>* evict_buffer,
                           test_util::KVMSBuffer<K, V, S>* pre_data_buffer,
                           size_t len, cudaStream_t stream, TableOptions& opt,
                           unsigned int global_epoch) {
  using ValueVec = test_util::ValueArray<V, dim>;

  std::map<K, ValueVec> values_map_before_insert;
  std::map<K, ValueVec> values_map_after_insert;

  std::unordered_map<K, S> scores_map_before_insert;
  std::map<K, S> scores_map_after_insert;

  std::map<K, S> scores_map_current_batch;
  std::map<K, S> scores_map_current_evict;

  K* keys = data_buffer->keys_ptr();
  V* values = data_buffer->values_ptr();
  S* scores = data_buffer->scores_ptr();

  K* evicted_keys = evict_buffer->keys_ptr();
  V* evicted_values = evict_buffer->values_ptr();
  S* evicted_scores = evict_buffer->scores_ptr();

  // Score contributed by each key of the incoming batch (host copy).
  for (size_t i = 0; i < len; i++) {
    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =
        data_buffer->scores_ptr(false)[i];
  }

  // A score is valid when its upper 32 bits hold the current epoch and its
  // lower 32 bits hold the previous low bits plus the batch contribution.
  const auto score_is_valid = [global_epoch](S current, S previous, S delta) {
    return ((current >> 32) == global_epoch) &&
           ((current & 0xFFFFFFFF) ==
            ((0xFFFFFFFF & previous) + (0xFFFFFFFF & delta)));
  };

  size_t table_size_before = table->size(stream);
  // Scratch capacity: everything currently stored plus the whole batch.
  size_t cap = table_size_before + len;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));

  // Host mirrors of the device scratch buffers. std::vector owns the memory
  // so an early ASSERT_* return cannot leak it.
  std::vector<K> h_keys_storage(cap);
  std::vector<V> h_values_storage(cap * dim);
  std::vector<S> h_scores_storage(cap);
  K* h_tmp_keys = h_keys_storage.data();
  V* h_tmp_values = h_values_storage.data();
  S* h_tmp_scores = h_scores_storage.data();

  // ---- Step 1: snapshot table content + incoming batch. ----
  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,
                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,
                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,
                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Values: table entries first, then the batch (a batch entry overwrites a
  // table entry with the same key).
  for (size_t i = 0; i < cap; i++) {
    ValueVec* vec = reinterpret_cast<ValueVec*>(h_tmp_values + i * dim);
    values_map_before_insert[h_tmp_keys[i]] = *vec;
  }

  // Scores: only what is actually stored in the table.
  for (size_t i = 0; i < table_size_before; i++) {
    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];
  }

  // ---- Step 2: assign() must only touch keys that already exist. ----
  table->set_global_epoch(global_epoch);
  table->assign(len, keys, values, scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  {
    size_t table_size_verify1 = table->export_batch(
        table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

    CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                               table_size_before * sizeof(K),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                               table_size_before * dim * sizeof(V),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                               table_size_before * sizeof(S),
                               cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table_size_verify1, table_size_before);

    size_t score_error_cnt = 0;

    // Walk backwards so that, for duplicated keys, the lowest index wins.
    for (size_t i = table_size_before; i-- > 0;) {
      ValueVec* vec = reinterpret_cast<ValueVec*>(h_tmp_values + i * dim);
      values_map_after_insert[h_tmp_keys[i]] = *vec;
      scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
    }

    for (const auto& it : scores_map_current_batch) {
      const K key = it.first;
      const S score = it.second;
      const S current_score = scores_map_after_insert[key];
      const auto before = scores_map_before_insert.find(key);
      // Keys that were not stored before are ignored in this step.
      if (before != scores_map_before_insert.end() &&
          !score_is_valid(current_score, before->second, score)) {
        score_error_cnt++;
      }
    }
    std::cout << "Check assign behavior got "
              << ", score_error_cnt: " << score_error_cnt
              << ", while len: " << len << std::endl;
    ASSERT_EQ(score_error_cnt, 0);
  }

  // The post-assign state becomes the baseline for the insert_and_evict step.
  for (size_t i = 0; i < table_size_before; i++) {
    values_map_before_insert[h_tmp_keys[i]] =
        values_map_after_insert[h_tmp_keys[i]];
    scores_map_before_insert[h_tmp_keys[i]] =
        scores_map_after_insert[h_tmp_keys[i]];
  }
  values_map_after_insert.clear();
  scores_map_after_insert.clear();

  // ---- Step 3: insert_and_evict(). ----
  // LRU-style strategies are called without caller-provided scores.
  size_t filtered_len = table->insert_and_evict(
      len, keys, values,
      (Table::evict_strategy == EvictStrategy::kLru ||
       Table::evict_strategy == EvictStrategy::kEpochLru)
          ? nullptr
          : scores,
      evicted_keys, evicted_values, evicted_scores, stream);
  evict_buffer->SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  for (size_t i = 0; i < filtered_len; i++) {
    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =
        evict_buffer->scores_ptr(false)[i];
  }

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  // Host layout: [0, table_size_after) table content, followed by
  // filtered_len evicted entries.
  size_t new_cap = table_size_after + filtered_len;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,
                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,
                             evicted_values, filtered_len * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,
                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  size_t key_miss_cnt = 0;
  size_t value_diff_cnt = 0;
  size_t score_error_cnt1 = 0;
  size_t score_error_cnt2 = 0;

  // Walk backwards so table entries overwrite evicted entries with the same
  // key.
  for (size_t i = new_cap; i-- > 0;) {
    ValueVec* vec = reinterpret_cast<ValueVec*>(h_tmp_values + i * dim);
    values_map_after_insert[h_tmp_keys[i]] = *vec;
    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
    if (i >= table_size_after) {
      // Evicted entry: its epoch must be older than global_epoch - 2.
      // NOTE(review): global_epoch is unsigned, so `global_epoch - 2` wraps
      // around for epochs 0 and 1 - confirm this is intended.
      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));
      if (!valid) {
        score_error_cnt1++;
      }
    }
  }

  for (const auto& it : scores_map_current_batch) {
    const K key = it.first;
    const S score = it.second;
    const S current_score = scores_map_after_insert[key];
    S score_before_insert = 0;
    // Only keys that are present afterwards and were not part of the evicted
    // output accumulate on top of their previous score.
    if (values_map_after_insert.find(key) != values_map_after_insert.end() &&
        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {
      score_before_insert = scores_map_before_insert[key];
    }
    if (!score_is_valid(current_score, score_before_insert, score)) {
      score_error_cnt2++;
    }
  }

  // Every key known before must still exist (table or evicted output) with
  // an unchanged value.
  for (auto& it : values_map_before_insert) {
    const auto after = values_map_after_insert.find(it.first);
    if (after == values_map_after_insert.end()) {
      ++key_miss_cnt;
      continue;
    }
    ValueVec& vec0 = it.second;
    ValueVec& vec1 = after->second;
    for (size_t j = 0; j < dim; j++) {
      if (vec0[j] != vec1[j]) {
        ++value_diff_cnt;
        break;
      }
    }
  }

  std::cout << "Check insert_and_evict behavior got "
            << "key_miss_cnt: " << key_miss_cnt
            << ", value_diff_cnt: " << value_diff_cnt
            << ", score_error_cnt1: " << score_error_cnt1
            << ", score_error_cnt2: " << score_error_cnt2
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  ASSERT_EQ(key_miss_cnt, 0);
  ASSERT_EQ(value_diff_cnt, 0);
  ASSERT_EQ(score_error_cnt1, 0);
  ASSERT_EQ(score_error_cnt2, 0);

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Drives CheckAssignOnEpochLfu() through 20 consecutive global epochs on an
// EvictStrategy::kEpochLfu table with a capacity of 1M entries.
//
// Each epoch generates a batch of B keys on the host, uploads it, runs the
// assign / insert_and_evict checks and then remembers the batch in
// pre_data_buffer, which is handed to the key generator of the next epoch.
//
// max_hbm_for_vectors is converted with nv::merlin::GB().
void test_assign_advanced_on_epochlfu(size_t max_hbm_for_vectors) {
  const size_t U = 1024 * 1024;
  const size_t B = 100000;
  constexpr size_t dim = 16;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = U;
  opt.max_bucket_size = 128;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Inner scope: the table and all buffers are destroyed before the stream
  // they were set up with is destroyed.
  {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(opt);

    test_util::KVMSBuffer<K, V, S> evict_buffer;
    evict_buffer.Reserve(B, dim, stream);
    evict_buffer.ToZeros(stream);

    test_util::KVMSBuffer<K, V, S> data_buffer;
    test_util::KVMSBuffer<K, V, S> pre_data_buffer;
    data_buffer.Reserve(B, dim, stream);
    pre_data_buffer.Reserve(B, dim, stream);

    const int freq_range = 100;
    // Passed to the key generator together with the previous batch; only
    // used from the second epoch on.
    const float repeat_rate = 0.1f;

    for (unsigned int global_epoch = 1; global_epoch <= 20; global_epoch++) {
      const bool first_epoch = (global_epoch <= 1);
      if (first_epoch) {
        test_util::create_random_keys_advanced<K, S, V>(
            dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
            data_buffer.values_ptr(false), (int)B, B * 32, freq_range);
      } else {
        test_util::create_random_keys_advanced<K, S, V>(
            dim, data_buffer.keys_ptr(false), pre_data_buffer.keys_ptr(false),
            data_buffer.scores_ptr(false), data_buffer.values_ptr(false),
            (int)B, B * 32, freq_range, repeat_rate);
      }
      data_buffer.SyncData(true, stream);
      if (first_epoch) {
        pre_data_buffer.CopyFrom(data_buffer, stream);
      }

      CheckAssignOnEpochLfu<K, V, S, Table, dim>(
          table.get(), &data_buffer, &evict_buffer, &pre_data_buffer, B,
          stream, opt, global_epoch);

      // Remember this batch for the key generator of the next epoch.
      pre_data_buffer.CopyFrom(data_buffer, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }
  }

  // Same epilogue as the other tests in this file: release the stream and
  // surface any pending CUDA error.
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Measures how well EvictStrategy::kCustomized keeps the highest-scored keys
// when far more keys are offered than the table can hold.
//
// For each of `rounds` rounds, STEPS batches of BATCH_SIZE continuous keys
// are pushed through assign() + find_or_insert(), i.e. exactly INIT_CAPACITY
// new keys per round. The table is then exported and the test asserts that
//   * every exported key equals its score,
//   * every exported value matches the "key * 0.00001" pattern,
//   * the largest exported key is at least the last key of the round, and
//   * the fraction of entries (relative to MAX_CAPACITY) whose score belongs
//     to the current round is at least expected_correct_rate.
//
// max_hbm_for_vectors is converted with nv::merlin::GB().
void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
  constexpr uint64_t STEPS = 128;
  constexpr uint64_t MAX_BUCKET_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t TEST_TIMES = 1;
  float expected_correct_rate = 0.964;
  const int rounds = 12;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = MAX_BUCKET_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // The HostBuffer objects are named locals so that they outlive the raw
  // pointers taken from them. Calling .ptr() on a temporary would leave the
  // pointer referring to a buffer whose owner is already destroyed.
  test_util::HostBuffer<K> keys_base_buffer(BATCH_SIZE);
  test_util::HostBuffer<S> scores_base_buffer(BATCH_SIZE);
  test_util::HostBuffer<V> vectors_base_buffer(BATCH_SIZE * options.dim);
  K* h_keys_base = keys_base_buffer.ptr();
  S* h_scores_base = scores_base_buffer.ptr();
  V* h_vectors_base = vectors_base_buffer.ptr();

  test_util::HostBuffer<K> keys_temp_buffer(MAX_CAPACITY);
  test_util::HostBuffer<S> scores_temp_buffer(MAX_CAPACITY);
  test_util::HostBuffer<V> vectors_temp_buffer(MAX_CAPACITY * options.dim);
  K* h_keys_temp = keys_temp_buffer.ptr();
  S* h_scores_temp = scores_temp_buffer.ptr();
  V* h_vectors_temp = vectors_temp_buffer.ptr();

  // Device buffers are sized for a full export; batch uploads reuse them.
  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  const size_t global_start_key = 100000;
  for (uint64_t t = 0; t < TEST_TIMES; t++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    size_t start_key = global_start_key;

    const size_t initial_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(initial_size, 0);

    for (int r = 0; r < rounds; r++) {
      // Keys offered in round r span [expected_min_key, expected_max_key].
      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;
      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;
      // The very first round starts from an empty table, so only the
      // expected rate is required; afterwards the table must stay full.
      size_t expected_table_size =
          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)
                   : INIT_CAPACITY;

      for (uint64_t s = 0; s < STEPS; s++) {
        test_util::create_continuous_keys<K, S, V, DIM>(
            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);
        start_key += BATCH_SIZE;

        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),
                              cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,
                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,
                              BATCH_SIZE * sizeof(V) * options.dim,
                              cudaMemcpyHostToDevice));
        table->assign(BATCH_SIZE, d_keys_temp, d_vectors_temp, d_scores_temp,
                      stream);
        table->find_or_insert(BATCH_SIZE, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }

      const size_t round_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_GE(round_size, expected_table_size);
      ASSERT_EQ(MAX_CAPACITY, table->capacity());

      size_t dump_counter = table->export_batch(
          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,
                            MAX_CAPACITY * sizeof(S), cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,
                            MAX_CAPACITY * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));

      size_t bigger_score_counter = 0;
      K max_key = 0;
      size_t values_error_counter = 0;
      for (size_t row = 0; row < dump_counter; row++) {
        // The exported score of every entry must equal its key.
        ASSERT_EQ(h_keys_temp[row], h_scores_temp[row]);
        max_key = std::max(max_key, h_keys_temp[row]);
        if (h_scores_temp[row] >= expected_min_key) bigger_score_counter++;
        for (size_t j = 0; j < options.dim; j++) {
          if (h_vectors_temp[row * options.dim + j] !=
              static_cast<float>(h_keys_temp[row] * 0.00001)) {
            values_error_counter++;
          }
        }
      }

      ASSERT_EQ(values_error_counter, 0);
      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;
      std::cout << std::setprecision(3) << "[Round " << r << "]"
                << "correct_rate=" << correct_rate << std::endl;
      ASSERT_GE(max_key, expected_max_key);
      ASSERT_GE(correct_rate, expected_correct_rate);
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/*
 * Exercises `assign`, `find` and `find_or_insert` on one shared table from
 * THREAD_N (64) host threads, each thread using its own CUDA stream and its
 * own set of KEY_NUM random keys.
 *
 * The threads are split into three batches:
 *   - batch 0: BATCH_0_RATIO * THREAD_N threads, started together and joined
 *              afterwards; results are checked with hard assertions.
 *   - batch 1: BATCH_1_RATIO * THREAD_N threads, each one joined before the
 *              next is started; results are checked with hard assertions.
 *   - batch 2: the remaining threads, started together; mismatches are only
 *              counted and printed because eviction may occur.
 * Even thread indices run `worker1`, odd indices run `worker2`.
 *
 * Parameters:
 *   max_hbm_for_vectors - passed through nv::merlin::GB() into
 *                         TableOptions::max_hbm_for_vectors.
 *   BATCH_0_RATIO       - fraction of THREAD_N assigned to batch 0.
 *   BATCH_1_RATIO       - fraction of THREAD_N assigned to batch 1.
 *   capacity_silent     - when true, capacity changes observed by a worker are
 *                         not printed.
 *
 * At the end the table capacity must equal MAX_CAPACITY.
 */
void test_find_or_insert_multi_threads(size_t max_hbm_for_vectors,
                                       const float BATCH_0_RATIO,
                                       const float BATCH_1_RATIO,
                                       bool capacity_silent = true) {
  constexpr uint64_t THREAD_N = 64UL;
  const uint64_t BATCH_0_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_0_RATIO);
  const uint64_t BATCH_1_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_1_RATIO);
  // Batch 2 takes whatever is left after batches 0 and 1.
  const uint64_t BATCH_2_SIZE = THREAD_N - BATCH_0_SIZE - BATCH_1_SIZE;

  constexpr uint64_t INIT_CAPACITY = 32 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = 128 * 1024 * 1024UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  // Number of keys handled by every single worker thread.
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;

  std::vector<std::thread> threads;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  // The table is shared by all worker threads (captured by reference below).
  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);
  // The workers assume that every generated key is different.
  // NOTE(review): presumably guaranteed by test_util::create_random_keys, also
  // across threads - confirm against its implementation.
  //
  // worker1: assign on not-yet-inserted keys -> find (nothing may be found)
  //          -> find_or_insert -> find -> compare values with the expected
  //          `key * 0.00001` pattern.
  auto worker1 = [&table, KEY_NUM, options, capacity_silent](int batch,
                                                             int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    // Remembered so that a capacity change can be reported at the end.
    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    // Every worker issues its table operations on a private stream.
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // NOTE(review): the value checks below rely on create_random_keys filling
    // each vector element with `key * 0.00001` - confirm in test_util.
    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    // `assign` is issued before this worker inserted its keys; the find that
    // follows must therefore report none of them as found.
    table->assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    {
      // Pointer-returning find: fetch per-key value pointers, then gather the
      // values into d_vectors.
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));
    {
      int found_num = 0;
      // h_found was allocated with cudaMallocHost; cudaMemset is applied to
      // that pinned host pointer here.
      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
        }
      }
      ASSERT_EQ(found_num, 0);
    }

    // Now really insert the keys.
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Clear the device values so that the next find has to refill them.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // Per-thread state: only the first mismatch of a thread is printed, all
    // mismatches are counted.
    thread_local bool print_unequal{false};
    thread_local uint64_t err_times{0};
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          if (batch == 2) {
            // Batch 2 tolerates mismatches: report and count instead of
            // asserting.
            if (h_vectors[i * options.dim + j] !=
                static_cast<float>(h_keys[i] * 0.00001)) {
              if (!print_unequal) {
                std::cout << " [Thread " << task_n << "]\t";
                UNEQUAL_EXPR(h_vectors[i * options.dim + j],
                             static_cast<float>(h_keys[i] * 0.00001));
                print_unequal = true;
              }
              err_times += 1;
            }
          } else {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
    }

    // Tracks whether the "[Thread n]" prefix was already written to the
    // current output line.
    bool print_thread_id{false};
    if (batch == 0 || batch == 1) {
      // Batches 0 and 1 require every key to be found with correct values.
      ASSERT_EQ(found_num, KEY_NUM);
      ASSERT_EQ(err_times, 0);
    } else {
      if (found_num != KEY_NUM or err_times != 0) {
        std::cout << " [Thread " << task_n << "]\t"
                  << "Number of keys(insert/found/error) : " << "(" << KEY_NUM
                  << "/" << found_num << "/" << err_times << ") \t";
        print_thread_id = true;
      }
    }
    if (current_capacity != table->capacity() && !capacity_silent) {
      if (!print_thread_id) std::cout << " [Thread " << task_n << "]\t";

      std::cout << "The capacity changed from " << current_capacity << " to "
                << table->capacity() << std::endl;
    } else if (print_thread_id) {
      // Terminate the statistics line started above.
      std::cout << std::endl;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));

    // NOTE(review): this copy repeats the one done before the verification
    // loop and h_vectors is freed right below without being read again.
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };
  // worker2: find_or_insert -> assign new values (every byte 0x02) -> find ->
  //          compare values with the bit pattern 0x02020202.
  auto worker2 = [&table, KEY_NUM, options, capacity_silent](int batch,
                                                             int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    // Remembered so that a capacity change can be reported at the end.
    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    // Replacement values written with `assign` after the insertion.
    V* d_new_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    // Every worker issues its table operations on a private stream.
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    // cudaMemset fills byte-wise: every byte of the new values becomes 0x02.
    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));

    // Insert the keys first, then overwrite their values through `assign`.
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    table->assign(KEY_NUM, d_keys, d_new_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Clear the device values so that the next find has to refill them.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    {
      // Pointer-returning find: fetch per-key value pointers, then gather the
      // values into d_vectors.
      V** d_vectors_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&d_vectors_ptr, KEY_NUM * sizeof(V*)));
      table->find(KEY_NUM, d_keys, d_vectors_ptr, d_found, nullptr, stream);
      test_util::read_from_ptr(d_vectors_ptr, d_vectors, options.dim, KEY_NUM,
                               stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaFree(d_vectors_ptr));
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // Per-thread state: only the first mismatch of a thread is printed, all
    // mismatches are counted.
    thread_local bool print_unequal{false};
    thread_local uint64_t err_times{0};
    // Bit pattern of a 4-byte element whose bytes are all 0x02 (see the
    // cudaMemset on d_new_vectors); it is reinterpreted as float below.
    // NOTE(review): assumes V is a 4-byte float - confirm against the alias.
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          if (batch == 2) {
            // Batch 2 tolerates mismatches: report and count instead of
            // asserting.
            if (h_vectors[i * options.dim + j] !=
                *(reinterpret_cast<float*>(&i_value))) {
              if (!print_unequal) {
                std::cout << " [Thread " << task_n << "]\t";
                UNEQUAL_EXPR(h_vectors[i * options.dim + j],
                             *(reinterpret_cast<float*>(&i_value)));
                print_unequal = true;
              }
              err_times += 1;
            }
          } else {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      *(reinterpret_cast<float*>(&i_value)));
          }
        }
      }
    }

    // Tracks whether the "[Thread n]" prefix was already written to the
    // current output line.
    bool print_thread_id{false};
    if (batch == 0 || batch == 1) {
      // Batches 0 and 1 require every key to be found with correct values.
      ASSERT_EQ(found_num, KEY_NUM);
      ASSERT_EQ(err_times, 0);
    } else {
      if (found_num != KEY_NUM or err_times != 0) {
        std::cout << " [Thread " << task_n << "]\t"
                  << "Number of keys(insert/found/error) : " << "(" << KEY_NUM
                  << "/" << found_num << "/" << err_times << ") \t";
        print_thread_id = true;
      }
    }
    if (current_capacity != table->capacity() && !capacity_silent) {
      if (!print_thread_id) std::cout << " [Thread " << task_n << "]\t";

      std::cout << "The capacity changed from " << current_capacity << " to "
                << table->capacity() << std::endl;
    } else if (print_thread_id) {
      // Terminate the statistics line started above.
      std::cout << std::endl;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));

    // NOTE(review): this copy repeats the one done before the verification
    // loop and h_vectors is freed right below without being read again.
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_new_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  /* Batch 0: the table is relatively idle; no eviction is expected. */
  // NOTE(review): every iteration starts two threads, so an odd BATCH_0_SIZE
  // would start one thread more than BATCH_0_SIZE (same for the other
  // batches).
  int batch = 0;
  std::cout << "[Batch 0] " << BATCH_0_SIZE << " threads\n";
  for (int i = 0; i < BATCH_0_SIZE; i += 2) {
    threads.emplace_back(std::thread(worker1, batch, i));
    threads.emplace_back(std::thread(worker2, batch, i + 1));
  }
  for (auto& th : threads) {
    th.join();
  }
  threads.clear();

  /* Batch 1: test the correctness of the APIs serially (one thread at a
   * time). */
  batch = 1;
  std::cout << "[Batch 1] " << BATCH_1_SIZE << " threads\n";
  for (int i = BATCH_0_SIZE; i < BATCH_0_SIZE + BATCH_1_SIZE; i += 2) {
    auto th = std::thread(worker1, batch, i);
    th.join();
    th = std::thread(worker2, batch, i + 1);
    th.join();
  }

  /* Batch 2: eviction may occur, so the workers only report mismatches. */
  batch = 2;
  std::cout << "[Batch 2] " << BATCH_2_SIZE << " threads\n";
  for (int i = BATCH_0_SIZE + BATCH_1_SIZE; i < THREAD_N; i += 2) {
    threads.emplace_back(std::thread(worker1, batch, i));
    threads.emplace_back(std::thread(worker2, batch, i + 1));
  }
  for (auto& th : threads) {
    th.join();
  }
  // After all batches the table must have reached its maximum capacity.
  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
}

/*
 * This test focuses on the compatibility of the value type.
 * In each round the batch size (KEY_NUM) does not exceed the bucket capacity,
 *   so the keys inserted in the last batch must always exist in HKV.
 * Each kernel is only launched on one SM,
 *   so consistency across SMs is not checked here.
 *
 * Every round works on a fresh range of KEY_NUM consecutive keys:
 *   Step1: find_or_insert the keys with values `key * 0.1` (cast to V).
 *   Step2: find_or_insert again with zeroed buffers; the test expects the
 *          stored values to be returned and the table size to stay unchanged.
 *   Step3: assign the values `key * 0.2` (cast to V) to the same keys; the
 *          table size must not change.
 *   Step4: find_or_insert again with zeroed buffers and check the updated
 *          values.
 *
 * Template parameters:
 *   V   - value element type under test.
 *   Dim - number of V elements stored per key (TableOptions::dim).
 */
template <typename V, int Dim>
void test_value_type_hbm_mode() {
  std::cout << "size of V: " << sizeof(V) << ", dim: " << Dim << std::endl;
  using Table =
      nv::merlin::HashTable<K, V, S, nv::merlin::EvictStrategy::kCustomized>;
  using TableOptions = nv::merlin::HashTableOptions;
  constexpr uint64_t BUCKET_MAX_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_TIMES = 2UL;

  K* h_keys;
  S* h_scores;
  V* h_vectors;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = Dim;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(16);

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));

  K* d_keys;
  S* d_scores;
  V* d_vectors;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  uint64_t table_size_before = 0;
  uint64_t table_size_after = 0;
  uint64_t found_num = 0;
  uint64_t value_diff_cnt = 0;
  uint64_t table_size_verify = 0;

  // A freshly initialised table must be empty.
  table_size_verify = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(table_size_verify, 0);

  K start_key = 1;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    // Build KEY_NUM consecutive keys; score == key, value == key * 0.1.
    for (size_t i = 0; i < KEY_NUM; i++) {
      h_keys[i] = start_key + static_cast<K>(i);
      h_scores[i] = h_keys[i];
      for (size_t j = 0; j < options.dim; j++) {
        h_vectors[i * options.dim + j] = static_cast<V>(h_keys[i] * 0.1);
      }
    }
    start_key += KEY_NUM;

    // Step1 : insert new keys.
    table_size_before = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    table_size_after = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_LE(table_size_after, table_size_before + KEY_NUM);

    // Step2 : look the new keys up again (through find_or_insert).
    // The device buffers are zeroed first so that the values read back can
    // only come from the table.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    table_size_verify = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table_size_verify, table_size_after);
    // Device -> host: the copy kind must match the pointer roles.
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    value_diff_cnt = 0;
    for (size_t i = 0; i < KEY_NUM; i++) {
      const V expected = static_cast<V>(h_keys[i] * 0.1);
      // A key counts as found when the first element of its vector matches.
      if (h_vectors[i * options.dim] == expected) ++found_num;
      // A key counts as different when any element of its vector mismatches.
      for (size_t j = 0; j < options.dim; j++) {
        if (h_vectors[i * options.dim + j] != expected) {
          ++value_diff_cnt;
          break;
        }
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);
    ASSERT_EQ(value_diff_cnt, 0);
    std::cout << "Check find_or_insert behavior got "
              << " key_miss_cnt: " << KEY_NUM - found_num
              << " value_diff_cnt: " << value_diff_cnt
              << " while table_size_before: " << table_size_before
              << ", while table_size_after: " << table_size_after
              << ", while len: " << KEY_NUM << std::endl;

    // Step3 : update old keys with value == key * 0.2.
    for (size_t i = 0; i < KEY_NUM; i++) {
      h_scores[i] = h_keys[i];
      for (size_t j = 0; j < options.dim; j++) {
        h_vectors[i * options.dim + j] = static_cast<V>(h_keys[i] * 0.2);
      }
    }
    table_size_before = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    table->assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    table_size_after = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    // `assign` only updates existing keys, so the size must not change.
    ASSERT_EQ(table_size_before, table_size_after);

    // Step4 : look the old keys up again (through find_or_insert).
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    table->find_or_insert(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    table_size_verify = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table_size_verify, table_size_after);
    // Device -> host: the copy kind must match the pointer roles.
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    value_diff_cnt = 0;
    for (size_t i = 0; i < KEY_NUM; i++) {
      const V expected = static_cast<V>(h_keys[i] * 0.2);
      if (h_vectors[i * options.dim] == expected) ++found_num;
      for (size_t j = 0; j < options.dim; j++) {
        if (h_vectors[i * options.dim + j] != expected) {
          ++value_diff_cnt;
          break;
        }
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);
    ASSERT_EQ(value_diff_cnt, 0);
    std::cout << "Check  assign        behavior got "
              << " key_miss_cnt: " << KEY_NUM - found_num
              << " value_diff_cnt: " << value_diff_cnt
              << " while table_size_before: " << table_size_before
              << ", while table_size_after: " << table_size_after
              << ", while len: " << KEY_NUM << std::endl;
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/*
 * Runs one `find_or_insert` batch on `table` and validates the complete table
 * content afterwards.
 *
 * Checks performed:
 *   - before the batch: the number of entries returned by `export_batch`
 *     equals `table->size()`;
 *   - after the batch: the same, and every exported value vector equals
 *     `static_cast<float>(key * 0.00001)` in all `dim` elements.
 *
 * Parameters:
 *   table  - table under test.
 *   keys   - `len` keys handed to find_or_insert.
 *   values - `len * dim` values handed to find_or_insert.
 *   scores - not used (find_or_insert is called with nullptr scores); kept so
 *            that existing callers keep compiling.
 *   len    - number of keys in the batch.
 *   stream - stream used for all table calls, allocations and copies.
 *
 * NOTE(review): `keys` / `values` are presumably device pointers - confirm
 * against the caller.
 * NOTE(review): a failing ASSERT_* returns before the device scratch buffers
 * are released; only the host buffers are exception/early-return safe.
 */
template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckFindOrInsertValues(Table* table, K* keys, V* values, S* scores,
                             size_t len, cudaStream_t stream) {
  (void)scores;  // Intentionally unused, see the function comment.

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;

  const size_t table_size_before = table->size(stream);
  // The table can hold at most this many entries once the batch is inserted.
  const size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));

  // Host staging buffers; owned by vectors so that they are released on every
  // exit path, including the early return of a failing ASSERT_*.
  std::vector<K> h_tmp_keys(cap);
  std::vector<V> h_tmp_values(cap * dim);

  // Consistency check before the batch: export count == reported size.
  const size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  table->find_or_insert(len, keys, values, nullptr, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Consistency check after the batch: export count == reported size.
  const size_t table_size_after = table->size(stream);
  const size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys.data(), d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values.data(), d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Index the exported entries by key. Iterating from the back keeps the
  // entry with the lowest export index should a key ever appear twice.
  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;
  for (int64_t i = static_cast<int64_t>(table_size_after) - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values.data() +
                                                         i * dim);
    map_after_insert[h_tmp_keys[i]] = *vec;
  }

  // Count the keys whose vector deviates from the expected pattern in at
  // least one element.
  size_t value_diff_cnt = 0;
  for (auto& it : map_after_insert) {
    test_util::ValueArray<V, dim>& vec = it.second;
    for (size_t j = 0; j < dim; j++) {
      if (vec[j] != static_cast<float>(it.first * 0.00001)) {
        ++value_diff_cnt;
        break;
      }
    }
  }
  ASSERT_EQ(value_diff_cnt, 0);
  std::cout << "Check find_or_insert behavior got "
            << "value_diff_cnt: " << value_diff_cnt
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_find_or_insert_values_check(size_t max_hbm_for_vectors) {
  const size_t U = 524288;
  const size_t init_capacity = 1024;
  const size_t B = 524288 + 13;
  constexpr size_t dim = 64;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
  opt.dim = 64;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<K, V, S> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  S score = 0;
  for (int i = 0; i < 20; i++) {
    test_util::create_random_keys<K, S, V, dim>(
        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, B * 16);
    data_buffer.SyncData(true, stream);

    CheckFindOrInsertValues<K, V, S, Table, dim>(
        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),
        data_buffer.scores_ptr(), B, stream);

    offset += B;
    score += 1;
  }
}

// GoogleTest registrations for the FindOrInsertTest suite.
// Most helpers are invoked twice, once with a first argument of 16 and once
// with 0.
// NOTE(review): the first argument is presumably `max_hbm_for_vectors` (as in
// test_find_or_insert_multi_threads) and the optional second one a
// reserved-key start bit - confirm against the helper signatures.
TEST(FindOrInsertTest, test_export_batch_if) {
  test_export_batch_if(16);
  test_export_batch_if(0, 31);
}
// Arguments: (max_hbm_for_vectors, BATCH_0_RATIO, BATCH_1_RATIO).
TEST(FindOrInsertTest, test_find_or_insert_multi_threads) {
  test_find_or_insert_multi_threads(16, 0.25f, 0.125f);
  test_find_or_insert_multi_threads(16, 0.375f, 0.125f);
  test_find_or_insert_multi_threads(0, 0.25f, 0.125f);
  test_find_or_insert_multi_threads(0, 0.375f, 0.125f);
}
// Template arguments: <value element type, number of elements per key>.
TEST(FindOrInsertTest, test_value_type_hbm_mode) {
  // 1-byte value types.
  test_value_type_hbm_mode<int8_t, 64>();
  test_value_type_hbm_mode<int8_t, 256>();
  test_value_type_hbm_mode<int8_t, 512>();

  // 1-byte value types with odd dimensions.
  test_value_type_hbm_mode<uint8_t, 63>();
  test_value_type_hbm_mode<uint8_t, 255>();
  test_value_type_hbm_mode<uint8_t, 511>();

  // 2-byte value types.
  test_value_type_hbm_mode<int16_t, 32>();
  test_value_type_hbm_mode<int16_t, 128>();
  test_value_type_hbm_mode<int16_t, 256>();

  // int and float value types.
  test_value_type_hbm_mode<int, 16>();
  test_value_type_hbm_mode<int, 64>();
  test_value_type_hbm_mode<float, 128>();

  // 8-byte value types with odd dimensions.
  test_value_type_hbm_mode<int64_t, 31>();
  test_value_type_hbm_mode<double, 63>();
}
TEST(FindOrInsertTest, test_basic) {
  test_basic(16, 61);
  test_basic(0);
}
TEST(FindOrInsertTest, test_basic_when_full) {
  test_basic_when_full(16);
  test_basic_when_full(0, 41);
}
TEST(FindOrInsertTest, test_erase_if_pred) {
  test_erase_if_pred(16);
  test_erase_if_pred(0, 17);
}
TEST(FindOrInsertTest, test_rehash) {
  test_rehash(16);
  test_rehash(0, 22);
}
TEST(FindOrInsertTest, test_rehash_on_big_batch) {
  test_rehash_on_big_batch(16, 37);
  test_rehash_on_big_batch(0);
}
TEST(FindOrInsertTest, test_dynamic_rehash_on_multi_threads) {
  test_dynamic_rehash_on_multi_threads(16, 22);
  test_dynamic_rehash_on_multi_threads(0);
}
// NOTE(review): unlike the other helpers this one is called with a single
// optional argument - check its signature for the meaning of 45.
TEST(FindOrInsertTest, test_basic_for_cpu_io) {
  test_basic_for_cpu_io(45);
  test_basic_for_cpu_io();
}
TEST(FindOrInsertTest, test_evict_strategy_lru_basic) {
  test_evict_strategy_lru_basic(16);
  test_evict_strategy_lru_basic(0, 44);
}
TEST(FindOrInsertTest, test_evict_strategy_lfu_basic) {
  test_evict_strategy_lfu_basic(16, 34);
  test_evict_strategy_lfu_basic(0);
}
TEST(FindOrInsertTest, test_evict_strategy_epochlru_basic) {
  test_evict_strategy_epochlru_basic(16, 41);
  test_evict_strategy_epochlru_basic(0);
}
TEST(FindOrInsertTest, test_evict_strategy_epochlfu_basic) {
  test_evict_strategy_epochlfu_basic(16, 42);
  test_evict_strategy_epochlfu_basic(0);
}
TEST(FindOrInsertTest, test_evict_strategy_customized_basic) {
  test_evict_strategy_customized_basic(16);
  test_evict_strategy_customized_basic(0, 43);
}
TEST(FindOrInsertTest, test_evict_strategy_customized_advanced) {
  test_evict_strategy_customized_advanced(16, 54);
  test_evict_strategy_customized_advanced(0);
}
// Only the variant with a first argument of 16 is run for this helper.
TEST(FindOrInsertTest, test_assign_advanced_on_epochlfu) {
  test_assign_advanced_on_epochlfu(16);
}
// Runs the correct-rate check with a first argument of 16 unconditionally and
// with 0 only when the IS_BLOSSOM_CI environment variable is not set.
TEST(FindOrInsertTest, test_evict_strategy_customized_correct_rate) {
  // TODO(rhdong): remove the skip logic after the blossom CI issue is
  // resolved.
  const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI"));
  test_evict_strategy_customized_correct_rate(16);
  if (!skip_hmem_check) {
    test_evict_strategy_customized_correct_rate(0);
  } else {
    std::cout << "The HMEM check is skipped in blossom CI!" << std::endl;
  }
}

// Runs the whole-table value check with max_hbm_for_vectors = 16 and = 0.
TEST(FindOrInsertTest, test_find_or_insert_values_check) {
  test_find_or_insert_values_check(16);
  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.
  // NOTE(review): the call below is active, so the TODO above looks stale -
  // confirm whether the hybrid-mode issue is fixed and drop the TODO if so.
  test_find_or_insert_values_check(0);
}


================================================
FILE: tests/find_with_missed_keys_test.cc.cu
================================================
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

// Number of value elements per key; used as TableOptions::dim in this file.
constexpr size_t DIM = 16;
// Key type (see the `K* h_keys` buffers below).
using K = uint64_t;
// Value element type (see the `V* h_vectors` buffers below).
using V = float;
// Score type (see the `S* h_scores` buffers below).
using S = uint64_t;
// Short aliases for the library types used by the tests.
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

void test_find(size_t max_hbm_for_vectors, size_t max_bucket_size,
               double load_factor, bool pipeline_lookup, int key_start = 0) {
  MERLIN_CHECK(load_factor >= 0.0 && load_factor <= 1.0,
               "Invalid `load_factor`");

  constexpr uint64_t INIT_CAPACITY = 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = INIT_CAPACITY;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  K* h_missed_keys;
  int* h_missed_indices;

  TableOptions options;
  options.reserved_key_start_bit = key_start;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::MB(max_hbm_for_vectors);
  if (pipeline_lookup) {
    options.max_bucket_size = 128;
  } else {
    options.max_bucket_size = 256;
  }
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_missed_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_missed_indices, KEY_NUM * sizeof(int)));

  K* d_keys;
  S* d_scores;
  V* d_vectors;
  K* d_missed_keys;
  int* d_missed_indices;
  int* d_missed_size;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_missed_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_missed_indices, KEY_NUM * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_missed_size, sizeof(int)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  int missed_size;
  for (int i = 0; i < TEST_TIMES; ++i) {
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));

    Table table;
    table.init(options);
    size_t size = table.size(stream);
    ASSERT_EQ(size, 0);

    size_t insert_num = (double)KEY_NUM * load_factor;
    table.insert_or_assign(insert_num, d_keys, d_vectors, d_scores, stream);
    table.find(KEY_NUM, d_keys, d_vectors, d_missed_keys, d_missed_indices,
               d_missed_size, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemcpy(&missed_size, d_missed_size, sizeof(int),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_missed_keys, d_missed_keys, missed_size * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_missed_indices, d_missed_indices,
                          missed_size * sizeof(int), cudaMemcpyDeviceToHost));

    if (insert_num == 0) {
      ASSERT_EQ(missed_size, KEY_NUM);
    } else {
      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));

      ASSERT_TRUE(missed_size > 0 && missed_size < KEY_NUM);
      std::vector<bool> founds(KEY_NUM, true);
      // Check missed
      for (int j = 0; j < missed_size; ++j) {
        int idx = h_missed_indices[i];
        ASSERT_TRUE(idx >= 0 && idx < KEY_NUM);
        ASSERT_EQ(h_keys[idx], h_missed_keys[i]);
        founds[idx] = false;
      }
      // Check hitted
      for (uint64_t j = 0; j < KEY_NUM; ++j) {
        if (founds[j]) {
          for (int k = 0; k < options.dim; ++k) {
            ASSERT_EQ(h_vectors[j * options.dim + k],
                      static_cast<float>(h_keys[j] * 0.00001));
          }
        }
      }
    }
  }

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_missed_keys));
  CUDA_CHECK(cudaFreeHost(h_missed_indices));
  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_missed_keys));
  CUDA_CHECK(cudaFree(d_missed_indices));
  CUDA_CHECK(cudaFree(d_missed_size));

  CudaCheckError();
}

// Looks up keys in a table into which nothing has been inserted
// (load factor 0.0), across the three memory configurations.
TEST(FindTest, test_find_when_empty) {
  constexpr double kLoadFactor = 0.0;
  constexpr size_t kPipelineBucket = 128;
  constexpr size_t kPlainBucket = 256;

  // pure HMEM
  test_find(0, kPipelineBucket, kLoadFactor, true, 12);
  test_find(0, kPlainBucket, kLoadFactor, false);
  // hybrid
  test_find(32, kPipelineBucket, kLoadFactor, true, 58);
  test_find(32, kPlainBucket, kLoadFactor, false);
  // pure HBM
  test_find(1024, kPipelineBucket, kLoadFactor, true);
  test_find(1024, kPlainBucket, kLoadFactor, false, 12);
}

// Looks up keys after inserting as many keys as the table capacity
// (load factor 1.0), across the three memory configurations.
TEST(FindTest, test_find_when_full) {
  constexpr double kLoadFactor = 1.0;
  constexpr size_t kPipelineBucket = 128;
  constexpr size_t kPlainBucket = 256;

  // pure HMEM
  test_find(0, kPipelineBucket, kLoadFactor, true);
  test_find(0, kPlainBucket, kLoadFactor, false);
  // hybrid
  test_find(32, kPipelineBucket, kLoadFactor, true);
  test_find(32, kPlainBucket, kLoadFactor, false, 60);
  // pure HBM
  test_find(1024, kPipelineBucket, kLoadFactor, true);
  test_find(1024, kPlainBucket, kLoadFactor, false);
}

// Runs test_find() at load factors 0.2, 0.5 and 0.75. For each load factor
// the pure HMEM, hybrid and pure HBM configurations are covered, each with
// both bucket-size/pipeline variants and a distinct key_start value.
TEST(FindTest, test_find_load_factor) {
  // One row per test_find() invocation, listed in execution order.
  struct FindCase {
    size_t hbm_mb;
    size_t bucket_size;
    double load_factor;
    bool pipeline;
    int key_start;
  };

  const FindCase cases[] = {
      // load factor 0.2
      {0, 128, 0.2, true, 45},      // pure HMEM
      {0, 256, 0.2, false, 12},     // pure HMEM
      {32, 128, 0.2, true, 27},     // hybrid
      {32, 256, 0.2, false, 53},    // hybrid
      {1024, 128, 0.2, true, 9},    // pure HBM
      {1024, 256, 0.2, false, 38},  // pure HBM
      // load factor 0.5
      {0, 128, 0.5, true, 21},      // pure HMEM
      {0, 256, 0.5, false, 46},     // pure HMEM
      {32, 128, 0.5, true, 31},     // hybrid
      {32, 256, 0.5, false, 59},    // hybrid
      {1024, 128, 0.5, true, 4},    // pure HBM
      {1024, 256, 0.5, false, 22},  // pure HBM
      // load factor 0.75
      {0, 128, 0.75, true, 11},      // pure HMEM
      {0, 256, 0.75, false, 34},     // pure HMEM
      {32, 128, 0.75, true, 18},     // hybrid
      {32, 256, 0.75, false, 47},    // hybrid
      {1024, 128, 0.75, true, 7},    // pure HBM
      {1024, 256, 0.75, false, 29},  // pure HBM
  };

  for (const FindCase& c : cases) {
    test_find(c.hbm_mb, c.bucket_size, c.load_factor, c.pipeline, c.key_start);
  }
}


================================================
FILE: tests/group_lock_test.cc.cu
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <chrono>
#include <system_error>
#include <thread>
#include <vector>
#include "merlin/group_lock.cuh"

using namespace nv::merlin;
using namespace std::chrono_literals;

// Test the basic functionality of the group_shared_mutex
// Single-threaded walk through every lock flavour of group_shared_mutex,
// checking the reader/updater counters reported by the mutex after each
// acquisition and after each scope exit.
TEST(GroupSharedMutexTest, BasicFunctionality) {
  group_shared_mutex mtx;
  ASSERT_EQ(mtx.read_count(), 0);
  ASSERT_EQ(mtx.update_count(), 0);

  {
    // Two read locks can be held at the same time; each one bumps the
    // reader counter.
    read_shared_lock first_reader(mtx);
    ASSERT_EQ(mtx.read_count(), 1);
    read_shared_lock second_reader(mtx);
    ASSERT_EQ(mtx.read_count(), 2);
  }
  // Leaving the scope releases both readers.
  ASSERT_EQ(mtx.read_count(), 0);
  ASSERT_EQ(mtx.update_count(), 0);

  {
    // A deferred update lock owns nothing until lock() is called; afterwards
    // only the updater counter is raised.
    update_shared_lock updater(mtx, std::defer_lock);
    EXPECT_FALSE(updater.owns_lock());
    ASSERT_EQ(mtx.read_count(), 0);
    ASSERT_EQ(mtx.update_count(), 0);
    updater.lock();
    ASSERT_EQ(mtx.read_count(), 0);
    ASSERT_EQ(mtx.update_count(), 1);
    EXPECT_TRUE(updater.owns_lock());
  }
  ASSERT_EQ(mtx.read_count(), 0);
  ASSERT_EQ(mtx.update_count(), 0);

  {
    // A deferred unique (update+read) lock raises both counters once locked,
    // and locking it a second time must abort with a diagnostic.
    update_read_lock exclusive(mtx, std::defer_lock);
    ASSERT_EQ(mtx.read_count(), 0);
    ASSERT_EQ(mtx.update_count(), 0);
    EXPECT_FALSE(exclusive.owns_lock());
    exclusive.lock();
    EXPECT_TRUE(exclusive.owns_lock());
    ASSERT_EQ(mtx.read_count(), 1);
    ASSERT_EQ(mtx.update_count(), 1);

    EXPECT_DEATH(exclusive.lock(), "trying to lock twice!");
  }
  ASSERT_EQ(mtx.read_count(), 0);
  ASSERT_EQ(mtx.update_count(), 0);
}

// Spawns 50 reader, 50 updater and 50 unique-lock threads against a single
// group_shared_mutex. Readers must never observe an active updater, updaters
// must never observe an active reader, and a unique holder must observe
// exactly one reader and one updater (itself). The test also expects that at
// least once several readers, and at least once several updaters, held the
// lock concurrently.
TEST(GroupSharedMutexTest, AdvancedFunctionalitySingleStream) {
  constexpr int kThreadsPerGroup = 50;

  group_shared_mutex mutex;
  bool multiple_read = false;
  bool multiple_update = false;

  // Body of a reader thread.
  auto reader_body = [&]() {
    read_shared_lock guard(mutex);
    EXPECT_TRUE(mutex.read_count() > 0);
    if (mutex.read_count() > 1) {
      multiple_read = true;
    }
    std::this_thread::sleep_for(1000ms);
    ASSERT_EQ(mutex.update_count(), 0);
  };

  // Body of an updater thread.
  auto updater_body = [&]() {
    update_shared_lock guard(mutex);
    EXPECT_TRUE(mutex.update_count() > 0);
    if (mutex.update_count() > 1) {
      multiple_update = true;
    }
    std::this_thread::sleep_for(1000ms);
    ASSERT_EQ(mutex.read_count(), 0);
  };

  // Body of a unique (update+read) thread.
  auto unique_body = [&]() {
    update_read_lock guard(mutex);
    ASSERT_EQ(mutex.read_count(), 1);
    ASSERT_EQ(mutex.update_count(), 1);
    std::this_thread::sleep_for(100ms);
  };

  // Launch order: all readers, then all updaters, then all uniques.
  std::vector<std::thread> reader_threads;
  for (int n = 0; n < kThreadsPerGroup; ++n) {
    reader_threads.emplace_back(reader_body);
  }

  std::vector<std::thread> updater_threads;
  for (int n = 0; n < kThreadsPerGroup; ++n) {
    updater_threads.emplace_back(updater_body);
  }

  std::vector<std::thread> unique_threads;
  for (int n = 0; n < kThreadsPerGroup; ++n) {
    unique_threads.emplace_back(unique_body);
  }

  for (std::thread& worker : reader_threads) worker.join();
  for (std::thread& worker : updater_threads) worker.join();
  for (std::thread& worker : unique_threads) worker.join();

  EXPECT_TRUE(multiple_update);
  EXPECT_TRUE(multiple_read);
}

// Same scenario as the single-stream test, except that every worker thread
// additionally creates its own CUDA stream before taking the lock and
// synchronises/destroys it before exiting. Readers must never observe an
// active updater, updaters must never observe an active reader, and a unique
// holder must observe exactly one reader and one updater (itself).
TEST(GroupSharedMutexTest, AdvancedFunctionalityMultiStream) {
  group_shared_mutex mutex;
  bool multiple_read = false;
  bool multiple_update = false;

  // Test multiple reads
  std::vector<std::thread> reads;
  for (int i = 0; i < 50; ++i) {
    reads.emplace_back([&]() {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      read_shared_lock read(mutex);
      EXPECT_TRUE(mutex.read_count() > 0);
      if (mutex.read_count() > 1) multiple_read = true;
      std::this_thread::sleep_for(1000ms);
      ASSERT_EQ(mutex.update_count(), 0);

      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaStreamDestroy(stream));
    });
  }

  // Test multiple updates
  std::vector<std::thread> updates;
  for (int i = 0; i < 50; ++i) {
    updates.emplace_back([&]() {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      update_shared_lock update(mutex);
      EXPECT_TRUE(mutex.update_count() > 0);
      if (mutex.update_count() > 1) multiple_update = true;
      std::this_thread::sleep_for(1000ms);
      ASSERT_EQ(mutex.read_count(), 0);

      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaStreamDestroy(stream));
    });
  }

  // Test multiple uniques
  std::vector<std::thread> uniques;
  for (int i = 0; i < 50; ++i) {
    uniques.emplace_back([&]() {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      update_read_lock unique(mutex);
      // A unique holder counts as exactly one reader AND one updater; check
      // both counters (the second check previously repeated read_count()).
      ASSERT_EQ(mutex.read_count(), 1);
      ASSERT_EQ(mutex.update_count(), 1);
      std::this_thread::sleep_for(100ms);

      CUDA_CHECK(cudaStreamSynchronize(stream));
      CUDA_CHECK(cudaStreamDestroy(stream));
    });
  }

  for (auto& th : reads) {
    th.join();
  }

  for (auto& th : updates) {
    th.join();
  }

  for (auto& th : uniques) {
    th.join();
  }

  EXPECT_TRUE(multiple_update);
  EXPECT_TRUE(multiple_read);
}


================================================
FILE: tests/insert_and_evict_test.cc.cu
================================================
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include <unordered_map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

constexpr size_t dim = 64;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

/*
 * The following steps check whether the insert_and_evict API
 * is safe to use:
 *
 *   step1: Create a table with max_capacity U.
 *   step2: Insert M keys into the table, where M < U. The
 *     table size becomes m <= M, and M - m keys are
 *     evicted.
 *   step3: Insert N keys into the table, where m + N > U and
 *     none of them duplicates any of the M keys; p keys get
 *     evicted. If the table size is now v, then the total number
 *     of keys T = v + p + M - m must equal VT = M + N,
 *     and the keys, values, and scores must match.
 *   step4: Export the table and check all values.
 */
// Inserts two disjoint key ranges ([0, M) and [M, M + N)) through
// insert_and_evict(), collects every evicted pair plus the final exported
// table contents into one ordered map, and verifies that the map holds
// exactly the keys 0 .. M + N - 1, each with a value vector filled with the
// key itself.
void test_insert_and_evict_basic() {
  TableOptions opt;

  // table setting
  const size_t init_capacity = 1024;

  // numeric setting
  const size_t U = 2llu << 18;
  const size_t M = (U >> 1);
  const size_t N = (U >> 1) + 17;  // Add a prime to test the non-aligned case.

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.num_of_buckets_per_alloc = 8;

  using Table =
      nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;
  opt.dim = dim;

  std::map<i64, test_util::ValueArray<f32, dim>> summarized_kvs;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // step1
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  // step2
  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(M, dim, stream);
  evict_buffer.ToZeros(stream);

  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(M, dim, stream);
  buffer.ToRange(0, 1, stream);
  buffer.Setscore((u64)1, stream);
  size_t n_evicted = table->insert_and_evict(
      M, buffer.keys_ptr(), buffer.values_ptr(), buffer.scores_ptr(),
      evict_buffer.keys_ptr(), evict_buffer.values_ptr(),
      evict_buffer.scores_ptr(), stream);
  size_t table_size_m = table->size(stream);
  buffer.SyncData(/*h2d=*/false, stream);
  evict_buffer.SyncData(/*h2d=*/false, stream);
  // Wait for the device-to-host copies before touching the host mirrors.
  // NOTE(review): assumes SyncData() enqueues its copies on `stream` without
  // blocking — confirm against test_util.cuh.
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(n_evicted + table_size_m, M);
  for (size_t i = 0; i < n_evicted; i++) {
    test_util::ValueArray<f32, dim>* vec =
        reinterpret_cast<test_util::ValueArray<f32, dim>*>(
            evict_buffer.values_ptr(false) + i * dim);
    summarized_kvs.emplace(evict_buffer.keys_ptr(false)[i], *vec);
  }

  //  step3
  evict_buffer.Reserve(N, dim, stream);
  buffer.Reserve(N, dim, stream);
  buffer.ToRange(M, 1, stream);
  buffer.Setscore((u64)2, stream);
  n_evicted = table->insert_and_evict(
      N, buffer.keys_ptr(), buffer.values_ptr(), buffer.scores_ptr(),
      evict_buffer.keys_ptr(), evict_buffer.values_ptr(),
      evict_buffer.scores_ptr(), stream);
  size_t table_size_n = table->size(stream);
  buffer.SyncData(/*h2d=*/false, stream);
  evict_buffer.SyncData(/*h2d=*/false, stream);
  // Same ordering requirement as in step2: sync first, then read on host.
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(table_size_m + N, table_size_n + n_evicted);
  for (size_t i = 0; i < n_evicted; i++) {
    test_util::ValueArray<f32, dim>* vec =
        reinterpret_cast<test_util::ValueArray<f32, dim>*>(
            evict_buffer.values_ptr(false) + i * dim);
    summarized_kvs.emplace(evict_buffer.keys_ptr(false)[i], *vec);
  }

  // step4
  buffer.Reserve(table_size_n, dim, stream);
  size_t n_exported =
      table->export_batch(table->capacity(), 0, buffer.keys_ptr(),
                          buffer.values_ptr(), buffer.scores_ptr(), stream);
  ASSERT_EQ(table_size_n, n_exported);
  buffer.SyncData(/*h2d=*/false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  for (size_t i = 0; i < n_exported; i++) {
    test_util::ValueArray<f32, dim>* vec =
        reinterpret_cast<test_util::ValueArray<f32, dim>*>(
            buffer.values_ptr(false) + i * dim);
    summarized_kvs.emplace(buffer.keys_ptr(false)[i], *vec);
  }

  buffer.Free(stream);
  evict_buffer.Free(stream);

  // The ordered map must now enumerate 0, 1, 2, ... with matching values.
  size_t k = 0;
  for (auto it = summarized_kvs.begin(); it != summarized_kvs.end(); it++) {
    i64 key = it->first;
    test_util::ValueArray<f32, dim>& value = it->second;
    ASSERT_EQ(key, (i64)k);
    for (size_t j = 0; j < dim; j++) {
      ASSERT_EQ(value[j], (f32)k);
    }
    ++k;
  }
  ASSERT_EQ(summarized_kvs.size(), M + N);
  summarized_kvs.clear();
}

/*
 * Runs one insert_and_evict() round on `table` and checks that no key/value
 * pair gets lost:
 *   1. snapshot = exported table contents + the `len` pairs about to be
 *      inserted;
 *   2. call insert_and_evict();
 *   3. cross-check that find() and contains() report the same number of hits
 *      for the inserted keys;
 *   4. result = exported table contents + the pairs written to the evict
 *      buffers;
 *   5. every key of the snapshot must appear in the result with an identical
 *      value vector.
 *
 * `keys`, `values`, `scores` and the `evicted_*` buffers are used as sources
 * of device-to-host copies, i.e. they must be device-accessible. Note that
 * the find() call in step 3 writes into `values` and `scores`. `opt` is not
 * read by this function.
 */
template <typename K, typename V, typename S, typename Table>
void CheckInsertAndEvict(Table* table, K* keys, V* values, S* scores,
                         K* evicted_keys, V* evicted_values, S* evicted_scores,
                         size_t len, cudaStream_t stream, TableOptions& opt) {
  std::map<K, test_util::ValueArray<V, dim>> map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> map_after_insert;
  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;
  bool* h_tmp_founds = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;
  bool* d_tmp_founds = nullptr;

  size_t table_size_before = table->size(stream);
  // Upper bound for both snapshots: current table content plus the batch.
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));
  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));
  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  // Host layout: [exported table content | batch to insert].
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,
                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,
                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,
                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Later entries (the batch) overwrite earlier ones (table content) when a
  // key occurs in both.
  for (size_t i = 0; i < cap; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    map_before_insert[h_tmp_keys[i]] = *vec;
  }

  auto start = std::chrono::steady_clock::now();
  size_t filtered_len = table->insert_and_evict(
      len, keys, values,
      Table::evict_strategy == EvictStrategy::kLru ? nullptr : scores,
      evicted_keys, evicted_values, evicted_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  {
    table->find(len, keys, values, d_tmp_founds, scores, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t found_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) found_counter++;
    }
    std::cout << "filtered_len:" << filtered_len
              << ", miss counter:" << len - found_counter << std::endl;

    // Keep the reset on `stream`: the buffer was allocated stream-ordered,
    // so all accesses are issued on that same stream.
    CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, len * sizeof(bool), stream));
    table->contains(len, keys, d_tmp_founds, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    // Synchronise AFTER enqueuing the copy so the host reads final data.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t contains_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) contains_counter++;
    }
    ASSERT_EQ(contains_counter, found_counter);
  }

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  // Host layout: [exported table content | evicted pairs].
  size_t new_cap = table_size_after + filtered_len;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,
                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,
                             evicted_values, filtered_len * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,
                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  // Walk backwards so that, for a key present in both parts, the entry at
  // the lower index (the exported table content) is assigned last and wins.
  int64_t new_cap_i64 = (int64_t)new_cap;
  for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    map_after_insert[h_tmp_keys[i]] = *vec;
  }

  size_t key_miss_cnt = 0;
  size_t value_diff_cnt = 0;
  for (auto& it : map_before_insert) {
    if (map_after_insert.find(it.first) == map_after_insert.end()) {
      ++key_miss_cnt;
      continue;
    }
    test_util::ValueArray<V, dim>& vec0 = it.second;
    test_util::ValueArray<V, dim>& vec1 = map_after_insert.at(it.first);
    // Count each differing key once.
    for (size_t j = 0; j < dim; j++) {
      if (vec0[j] != vec1[j]) {
        ++value_diff_cnt;
        break;
      }
    }
  }
  std::cout << "Check insert_and_evict behavior got "
            << "key_miss_cnt: " << key_miss_cnt
            << ", value_diff_cnt: " << value_diff_cnt
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << ", dur: " << dur << std::endl;

  ASSERT_EQ(key_miss_cnt, 0);
  ASSERT_EQ(value_diff_cnt, 0);

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  free(h_tmp_founds);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

// Drives CheckInsertAndEvict() for 16 rounds against an LRU table whose
// capacity (U) is slightly smaller than the batch size (B = U + 13). Each
// round regenerates the host-side batch with create_random_keys() and
// uploads it before the check runs.
void test_insert_and_evict_advanced_on_lru() {
  const size_t U = 524288;
  const size_t init_capacity = U;
  const size_t B = 524288 + 13;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.max_bucket_size = 128;
  opt.num_of_buckets_per_alloc = 32;
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(B, dim, stream);
  evict_buffer.ToZeros(stream);

  test_util::KVMSBuffer<i64, f32, u64> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  for (int i = 0; i < 16; i++) {
    // Fill the host mirrors, then push them to the device (h2d = true).
    test_util::create_random_keys<i64, u64, f32, dim>(
        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, B * 16);
    data_buffer.SyncData(true, stream);

    CheckInsertAndEvict<i64, f32, u64, Table>(
        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),
        data_buffer.scores_ptr(), evict_buffer.keys_ptr(),
        evict_buffer.values_ptr(), evict_buffer.scores_ptr(), B, stream, opt);
  }
}

/*
 * Runs one insert_and_evict() round on `table` (after setting the global
 * epoch) and checks values and scores:
 *   1. snapshot = exported table contents + the `len` pairs of `data_buffer`;
 *   2. call set_global_epoch() and insert_and_evict();
 *   3. cross-check that find() and contains() report the same number of hits
 *      for the inserted keys;
 *   4. result = exported table contents + evicted pairs;
 *   5. snapshot and result must hold the same number of keys, every snapshot
 *      key must be present in the result with identical values;
 *   6. for every batch key that was already in the table and was not part of
 *      this round's evict output, the resulting score must equal the batch
 *      score plus the score it had before.
 *
 * The host mirrors of `data_buffer` are read for the batch scores, so they
 * must already hold the batch. Note that the find() call in step 3 writes
 * into the device values/scores of `data_buffer`. `opt` is not read by this
 * function.
 */
template <typename K, typename V, typename S, typename Table>
void CheckInsertAndEvictOnLfu(Table* table,
                              test_util::KVMSBuffer<K, V, S>* data_buffer,
                              test_util::KVMSBuffer<K, V, S>* evict_buffer,
                              size_t len, cudaStream_t stream,
                              TableOptions& opt, unsigned int global_epoch) {
  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;

  std::unordered_map<K, S> scores_map_before_insert;
  std::map<K, S> scores_map_after_insert;

  std::map<K, S> scores_map_current_batch;
  std::map<K, S> scores_map_current_evict;

  K* keys = data_buffer->keys_ptr();
  V* values = data_buffer->values_ptr();
  S* scores = data_buffer->scores_ptr();

  K* evicted_keys = evict_buffer->keys_ptr();
  V* evicted_values = evict_buffer->values_ptr();
  S* evicted_scores = evict_buffer->scores_ptr();

  // Remember the score each batch key is inserted with (host mirror).
  for (size_t i = 0; i < len; i++) {
    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =
        data_buffer->scores_ptr(false)[i];
  }

  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;
  bool* h_tmp_founds = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;
  bool* d_tmp_founds = nullptr;

  size_t table_size_before = table->size(stream);
  // Upper bound for both snapshots: current table content plus the batch.
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));
  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));
  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  // Host layout: [exported table content | batch to insert].
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,
                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,
                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,
                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Later entries (the batch) overwrite earlier ones (table content) when a
  // key occurs in both.
  for (size_t i = 0; i < cap; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_before_insert[h_tmp_keys[i]] = *vec;
  }

  // Scores are recorded for the pre-existing table content only.
  for (size_t i = 0; i < table_size_before; i++) {
    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];
  }

  auto start = std::chrono::steady_clock::now();
  table->set_global_epoch(global_epoch);
  size_t filtered_len = table->insert_and_evict(
      len, keys, values,
      (Table::evict_strategy == EvictStrategy::kLru ||
       Table::evict_strategy == EvictStrategy::kEpochLru)
          ? nullptr
          : scores,
      evicted_keys, evicted_values, evicted_scores, stream);
  evict_buffer->SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  {
    table->find(len, keys, values, d_tmp_founds, scores, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t found_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) found_counter++;
    }

    // Keep the reset on `stream`: the buffer was allocated stream-ordered,
    // so all accesses are issued on that same stream.
    CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, len * sizeof(bool), stream));
    table->contains(len, keys, d_tmp_founds, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    // Synchronise AFTER enqueuing the copy so the host reads final data.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t contains_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) contains_counter++;
    }
    ASSERT_EQ(contains_counter, found_counter);
  }

  for (size_t i = 0; i < filtered_len; i++) {
    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =
        evict_buffer->scores_ptr(false)[i];
  }

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  // Host layout: [exported table content | evicted pairs].
  size_t new_cap = table_size_after + filtered_len;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,
                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,
                             evicted_values, filtered_len * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,
                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int64_t new_cap_i64 = (int64_t)new_cap;

  size_t key_miss_cnt = 0;
  size_t value_diff_cnt = 0;
  size_t score_error_cnt = 0;

  // Walk backwards so that, for a key present in both parts, the entry at
  // the lower index (the exported table content) is assigned last and wins.
  for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_after_insert[h_tmp_keys[i]] = *vec;
    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
  }

  for (const auto& it : scores_map_current_batch) {
    const K key = it.first;
    const S score = it.second;
    S current_score = scores_map_after_insert[key];
    S score_before_insert = 0;
    // Only keys that were already present and were not part of this round's
    // evict output are expected to have an accumulated score.
    if (scores_map_before_insert.find(key) != scores_map_before_insert.end() &&
        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {
      score_before_insert = scores_map_before_insert[key];
    } else {
      continue;
    }
    bool valid = (current_score == score + score_before_insert);
    if (!valid) {
      score_error_cnt++;
    }
  }

  ASSERT_EQ(values_map_before_insert.size(), values_map_after_insert.size());

  for (auto& it : values_map_before_insert) {
    if (values_map_after_insert.find(it.first) ==
        values_map_after_insert.end()) {
      ++key_miss_cnt;
      continue;
    }
    test_util::ValueArray<V, dim>& vec0 = it.second;
    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);
    // Counts every differing element (not just every differing key).
    for (size_t j = 0; j < dim; j++) {
      if (vec0[j] != vec1[j]) {
        ++value_diff_cnt;
      }
    }
  }
  std::cout << "Check insert_and_evict behavior got "
            << "key_miss_cnt: " << key_miss_cnt
            << ", value_diff_cnt: " << value_diff_cnt
            << ", score_error_cnt: " << score_error_cnt
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << ", dur: " << dur << std::endl;

  ASSERT_EQ(key_miss_cnt, 0);
  ASSERT_EQ(score_error_cnt, 0);
  ASSERT_EQ(value_diff_cnt, 0);

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  free(h_tmp_founds);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_insert_and_evict_advanced_on_lfu() {
  const size_t U = 1024 * 1024;
  const size_t init_capacity = U;
  const size_t B = 256 * 1024;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.max_bucket_size = 128;
  opt.num_of_buckets_per_alloc = 32;
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLfu>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(B, dim, stream);
  evict_buffer.ToZeros(stream);

  test_util::KVMSBuffer<i64, f32, u64> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  for (unsigned int global_epoch = 1; global_epoch <= 32; global_epoch++) {
    test_util::create_random_keys_advanced<i64, u64, f32>(
        dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, B * 16, 100);
    data_buffer.SyncData(true, stream);

    CheckInsertAndEvictOnLfu<i64, f32, u64, Table>(
        table.get(), &data_buffer, &evict_buffer, B, stream, opt, global_epoch);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    offset += B;
  }
}

/**
 * Inserts one batch with insert_and_evict and validates the resulting table
 * state for an epoch-based LRU table.
 *
 * Checks performed:
 *  - export_batch returns as many entries as table->size() before and after.
 *  - every key of the batch is reported by both find() and contains().
 *  - every evicted score has an epoch (upper 32 bits) below global_epoch - 2.
 *  - every batch key ends up with epoch == global_epoch and a lower 32-bit
 *    part inside the window [nano_before >> 20, nano_after >> 20] (masked to
 *    32 bits) measured on the host around the insert.
 *  - every key known before the insert (table content plus batch) is present
 *    afterwards, either in the table or in the evicted output, with the same
 *    value vector.
 *
 * @param table         Table under test.
 * @param data_buffer   Batch to insert; keys_ptr(false)/scores_ptr(false) are
 *                      read on the host, the default pointers are passed to
 *                      the table and used as device-to-host copy sources.
 * @param evict_buffer  Receives evicted keys/values/scores.
 * @param len           Number of keys in the batch.
 * @param stream        Stream all table and copy operations are issued on.
 * @param opt           Not used by this function; kept for signature parity
 *                      with the sibling check helpers.
 * @param global_epoch  Epoch set on the table before inserting.
 *
 * NOTE: a failing ASSERT_* returns early and skips the buffer release at the
 * end of the function.
 */
template <typename K, typename V, typename S, typename Table>
void CheckInsertAndEvictOnEpochLru(Table* table,
                                   test_util::KVMSBuffer<K, V, S>* data_buffer,
                                   test_util::KVMSBuffer<K, V, S>* evict_buffer,
                                   size_t len, cudaStream_t stream,
                                   TableOptions& opt,
                                   unsigned int global_epoch) {
  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;

  std::map<K, S> scores_map_after_insert;

  // Only the key set of this map is consulted later on.
  std::map<K, S> scores_map_current_batch;

  K* keys = data_buffer->keys_ptr();
  V* values = data_buffer->values_ptr();
  S* scores = data_buffer->scores_ptr();

  K* evicted_keys = evict_buffer->keys_ptr();
  V* evicted_values = evict_buffer->values_ptr();
  S* evicted_scores = evict_buffer->scores_ptr();

  for (size_t i = 0; i < len; i++) {
    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =
        data_buffer->scores_ptr(false)[i];
  }

  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;
  bool* h_tmp_founds = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;
  bool* d_tmp_founds = nullptr;

  size_t table_size_before = table->size(stream);
  // Scratch buffers hold the current table content plus the incoming batch.
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));
  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));
  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  // Host snapshot layout: [exported table entries | batch entries].
  // Pre-insert scores are not needed by any check in this function.
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,
                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,
                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Batch entries come last, so for a key present in both the table and the
  // batch the batch value is the expected one.
  for (size_t i = 0; i < cap; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_before_insert[h_tmp_keys[i]] = *vec;
  }

  S nano_before_insert = test_util::host_nano<S>();

  auto start = std::chrono::steady_clock::now();
  table->set_global_epoch(global_epoch);
  size_t filtered_len = table->insert_and_evict(
      len, keys, values,
      (Table::evict_strategy == EvictStrategy::kLru ||
       Table::evict_strategy == EvictStrategy::kEpochLru)
          ? nullptr
          : scores,
      evicted_keys, evicted_values, evicted_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  S nano_after_insert = test_util::host_nano<S>();

  {
    table->find(len, keys, values, d_tmp_founds, scores, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t found_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) found_counter++;
    }
    std::cout << "filtered_len:" << filtered_len
              << ", miss counter:" << len - found_counter << std::endl;
    ASSERT_EQ(len, found_counter);

    // Keep the reset on the same stream as contains() and wait for the copy
    // to finish before the host inspects the flags.
    CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, len * sizeof(bool), stream));
    table->contains(len, keys, d_tmp_founds, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t contains_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) contains_counter++;
    }
    ASSERT_EQ(contains_counter, found_counter);
  }

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  // Host snapshot layout: [exported table entries | evicted entries].
  size_t new_cap = table_size_after + filtered_len;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,
                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,
                             evicted_values, filtered_len * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,
                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  const int64_t new_cap_i64 = static_cast<int64_t>(new_cap);
  // First index of the evicted section inside the host snapshot.
  const int64_t evicted_begin =
      new_cap_i64 - static_cast<int64_t>(filtered_len);

  size_t key_miss_cnt = 0;
  size_t value_diff_cnt = 0;
  size_t score_error_cnt1 = 0;
  size_t score_error_cnt2 = 0;

  // Walk backwards so that, for a key that appears both in the evicted
  // section and in the table section, the table entry is written last and
  // therefore wins in the maps.
  for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_after_insert[h_tmp_keys[i]] = *vec;
    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
    if (i >= evicted_begin) {
      // global_epoch is unsigned: for global_epoch < 2 the right-hand side
      // wraps around to a very large value.
      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));
      if (!valid) {
        score_error_cnt1++;
      }
    }
  }

  for (auto& it : scores_map_current_batch) {
    S score = scores_map_after_insert[it.first];
    bool valid =
        ((score >> 32) == global_epoch) &&
        ((score & 0xFFFFFFFF) >= (0xFFFFFFFF & (nano_before_insert >> 20))) &&
        ((score & 0xFFFFFFFF) <= (0xFFFFFFFF & (nano_after_insert >> 20)));
    if (!valid) {
      score_error_cnt2++;
    }
  }
  for (auto& it : values_map_before_insert) {
    if (values_map_after_insert.find(it.first) ==
        values_map_after_insert.end()) {
      ++key_miss_cnt;
      continue;
    }
    test_util::ValueArray<V, dim>& vec0 = it.second;
    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);
    // A key counts at most once, no matter how many components differ.
    for (size_t j = 0; j < dim; j++) {
      if (vec0[j] != vec1[j]) {
        ++value_diff_cnt;
        break;
      }
    }
  }
  std::cout << "Check insert_and_evict behavior got "
            << "key_miss_cnt: " << key_miss_cnt
            << ", value_diff_cnt: " << value_diff_cnt
            << ", score_error_cnt1: " << score_error_cnt1
            << ", score_error_cnt2: " << score_error_cnt2
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << ", dur: " << dur << std::endl;

  ASSERT_EQ(key_miss_cnt, 0);
  ASSERT_EQ(value_diff_cnt, 0);
  ASSERT_EQ(score_error_cnt1, 0);
  ASSERT_EQ(score_error_cnt2, 0);

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  free(h_tmp_founds);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_insert_and_evict_advanced_on_epochlru() {
  const size_t U = 1024 * 1024;
  const size_t init_capacity = U;
  const size_t B = 128 * 1024;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.max_bucket_size = 128;
  opt.num_of_buckets_per_alloc = 32;
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kEpochLru>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(B, dim, stream);
  evict_buffer.ToZeros(stream);

  test_util::KVMSBuffer<i64, f32, u64> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  u64 score = 0;
  for (unsigned int global_epoch = 1; global_epoch <= 64; global_epoch++) {
    test_util::create_random_keys_advanced<i64, u64, f32>(
        dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, B * 16);
    data_buffer.SyncData(true, stream);

    CheckInsertAndEvictOnEpochLru<i64, f32, u64, Table>(
        table.get(), &data_buffer, &evict_buffer, B, stream, opt, global_epoch);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    offset += B;
    score += 1;
  }
}

/**
 * Inserts one batch with insert_and_evict and validates the resulting table
 * state for an epoch-based LFU table.
 *
 * Checks performed:
 *  - export_batch returns as many entries as table->size() before and after.
 *  - every key of the previous batch (pre_data_buffer) is still found, and
 *    its score epoch (upper 32 bits) is either global_epoch or
 *    global_epoch - 1.
 *  - every key of the current batch is found with epoch == global_epoch.
 *  - find() and contains() agree on the number of hits.
 *  - every evicted score has an epoch below global_epoch - 2.
 *  - for every batch key the lower 32 bits of the stored score equal the
 *    lower 32 bits of the batch score plus, when the key was in the table
 *    before and was not evicted by this call, the lower 32 bits of its
 *    previous score.
 *  - every key known before the insert (table content plus batch) is present
 *    afterwards, either in the table or in the evicted output, with the same
 *    value vector.
 *
 * @param table            Table under test.
 * @param data_buffer      Batch to insert. Its device values/scores are
 *                         overwritten by the find() calls below and its host
 *                         side is refreshed from the device.
 * @param evict_buffer     Receives evicted keys/values/scores; synced to the
 *                         host after the insert.
 * @param pre_data_buffer  Batch of the previous epoch. Its device scores are
 *                         overwritten by find() and copied to the host.
 * @param len              Number of keys in each batch.
 * @param stream           Stream all table and copy operations are issued on.
 * @param opt              Not used by this function; kept for signature
 *                         parity with the sibling check helpers.
 * @param global_epoch     Epoch set on the table before inserting.
 *
 * NOTE: a failing ASSERT_* returns early and skips the buffer release at the
 * end of the function.
 */
template <typename K, typename V, typename S, typename Table>
void CheckInsertAndEvictOnEpochLfu(
    Table* table, test_util::KVMSBuffer<K, V, S>* data_buffer,
    test_util::KVMSBuffer<K, V, S>* evict_buffer,
    test_util::KVMSBuffer<K, V, S>* pre_data_buffer, size_t len,
    cudaStream_t stream, TableOptions& opt, unsigned int global_epoch) {
  std::map<K, test_util::ValueArray<V, dim>> values_map_before_insert;
  std::map<K, test_util::ValueArray<V, dim>> values_map_after_insert;

  std::unordered_map<K, S> scores_map_before_insert;
  std::map<K, S> scores_map_after_insert;

  std::map<K, S> scores_map_current_batch;
  std::map<K, S> scores_map_current_evict;

  K* keys = data_buffer->keys_ptr();
  V* values = data_buffer->values_ptr();
  S* scores = data_buffer->scores_ptr();

  K* evicted_keys = evict_buffer->keys_ptr();
  V* evicted_values = evict_buffer->values_ptr();
  S* evicted_scores = evict_buffer->scores_ptr();

  // Must be captured before any find() below rewrites the batch scores.
  for (size_t i = 0; i < len; i++) {
    scores_map_current_batch[data_buffer->keys_ptr(false)[i]] =
        data_buffer->scores_ptr(false)[i];
  }

  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;
  bool* h_tmp_founds = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;
  bool* d_tmp_founds = nullptr;

  size_t table_size_before = table->size(stream);
  // Scratch buffers hold the current table content plus the incoming batch.
  size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));
  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));
  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));

  size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_before, table_size_verify0);

  // Host snapshot layout: [exported table entries | batch entries].
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_before * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_before * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_before * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys,
                             len * sizeof(K), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim, values,
                             len * dim * sizeof(V), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before, scores,
                             len * sizeof(S), cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Batch entries come last, so for a key present in both the table and the
  // batch the batch value is the expected one.
  for (size_t i = 0; i < cap; i++) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_before_insert[h_tmp_keys[i]] = *vec;
  }

  for (size_t i = 0; i < table_size_before; i++) {
    scores_map_before_insert[h_tmp_keys[i]] = h_tmp_scores[i];
  }

  auto start = std::chrono::steady_clock::now();
  table->set_global_epoch(global_epoch);
  size_t filtered_len = table->insert_and_evict(
      len, keys, values,
      (Table::evict_strategy == EvictStrategy::kLru ||
       Table::evict_strategy == EvictStrategy::kEpochLru)
          ? nullptr
          : scores,
      evicted_keys, evicted_values, evicted_scores, stream);
  evict_buffer->SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  auto end = std::chrono::steady_clock::now();
  auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

  // Keys of the previous batch must still be resident.
  {
    table->find(len, pre_data_buffer->keys_ptr(), values, d_tmp_founds,
                pre_data_buffer->scores_ptr(), stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    // Issue the device-to-host sync on the stream that is synchronized next,
    // so the host reads below see completed data.
    pre_data_buffer->SyncData(false, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t found_counter = 0;
    size_t old_epoch_counter = 0;
    size_t new_epoch_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) found_counter++;
      S score = pre_data_buffer->scores_ptr(false)[i];
      S cur_epoch = score >> 32;
      if (global_epoch == cur_epoch) new_epoch_counter++;
      if (global_epoch - 1 == cur_epoch) old_epoch_counter++;
    }
    ASSERT_EQ(len, new_epoch_counter + old_epoch_counter);
    std::cout << "old_epoch_counter:" << old_epoch_counter
              << ", new_epoch_counter:" << new_epoch_counter << std::endl
              << ", pre_data filtered_len:" << filtered_len
              << ", pre_data miss counter:" << len - found_counter << std::endl;
    ASSERT_EQ(len, found_counter);

    CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, len * sizeof(bool), stream));
    table->contains(len, pre_data_buffer->keys_ptr(), d_tmp_founds, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t contains_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) contains_counter++;
    }
    ASSERT_EQ(contains_counter, found_counter);
  }

  // Keys of the current batch must be resident and carry the current epoch.
  {
    table->find(len, keys, values, d_tmp_founds, scores, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    data_buffer->SyncData(false, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t found_counter = 0;
    size_t new_epoch_counter = 0;
    for (size_t i = 0; i < len; i++) {
      S score = data_buffer->scores_ptr(false)[i];
      S cur_epoch = score >> 32;
      if (h_tmp_founds[i]) found_counter++;
      if (global_epoch == cur_epoch) new_epoch_counter++;
    }
    ASSERT_EQ(len, new_epoch_counter);
    std::cout << "filtered_len:" << filtered_len
              << ", miss counter:" << len - found_counter << std::endl;
    ASSERT_EQ(len, found_counter);

    CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, len * sizeof(bool), stream));
    table->contains(len, keys, d_tmp_founds, stream);
    CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, len * sizeof(bool),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    size_t contains_counter = 0;
    for (size_t i = 0; i < len; i++) {
      if (h_tmp_founds[i]) contains_counter++;
    }
    ASSERT_EQ(contains_counter, found_counter);
  }

  // Informational only: share of keys repeated across the two batches.
  {
    std::unordered_set<K> unique_keys;
    for (size_t i = 0; i < len; i++) {
      unique_keys.insert(data_buffer->keys_ptr(false)[i]);
      unique_keys.insert(pre_data_buffer->keys_ptr(false)[i]);
    }
    float repeat_rate = (len * 2.0 - unique_keys.size()) / (len * 1.0);
    std::cout << "repeat_rate:" << repeat_rate << std::endl;
  }

  for (size_t i = 0; i < filtered_len; i++) {
    scores_map_current_evict[evict_buffer->keys_ptr(false)[i]] =
        evict_buffer->scores_ptr(false)[i];
  }

  float dur = diff.count();

  size_t table_size_after = table->size(stream);
  size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

  ASSERT_EQ(table_size_verify1, table_size_after);

  // Host snapshot layout: [exported table entries | evicted entries].
  size_t new_cap = table_size_after + filtered_len;
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                             table_size_after * sizeof(S),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,
                             filtered_len * sizeof(K), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,
                             evicted_values, filtered_len * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after, evicted_scores,
                             filtered_len * sizeof(S), cudaMemcpyDeviceToHost,
                             stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));
  const int64_t new_cap_i64 = static_cast<int64_t>(new_cap);
  // First index of the evicted section inside the host snapshot.
  const int64_t evicted_begin =
      new_cap_i64 - static_cast<int64_t>(filtered_len);

  size_t key_miss_cnt = 0;
  size_t value_diff_cnt = 0;
  size_t score_error_cnt1 = 0;
  size_t score_error_cnt2 = 0;

  // Walk backwards so that, for a key that appears both in the evicted
  // section and in the table section, the table entry is written last and
  // therefore wins in the maps.
  for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {
    test_util::ValueArray<V, dim>* vec =
        reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                         i * dim);
    values_map_after_insert[h_tmp_keys[i]] = *vec;
    scores_map_after_insert[h_tmp_keys[i]] = h_tmp_scores[i];
    if (i >= evicted_begin) {
      // global_epoch is unsigned: for global_epoch < 2 the right-hand side
      // wraps around to a very large value.
      bool valid = ((h_tmp_scores[i] >> 32) < (global_epoch - 2));
      if (!valid) {
        score_error_cnt1++;
      }
    }
  }

  for (const auto& it : scores_map_current_batch) {
    const K key = it.first;
    // Scores are of type S; keeping them in K would mix the two domains.
    const S score = it.second;
    S current_score = scores_map_after_insert[key];
    S score_before_insert = 0;
    // The previous frequency only carries over when the key was resident
    // before this call and was not pushed out by it.
    if (scores_map_before_insert.find(key) != scores_map_before_insert.end() &&
        scores_map_current_evict.find(key) == scores_map_current_evict.end()) {
      score_before_insert = scores_map_before_insert[key];
    }
    bool valid = ((current_score >> 32) == global_epoch) &&
                 ((current_score & 0xFFFFFFFF) ==
                  ((0xFFFFFFFF & score_before_insert) + (0xFFFFFFFF & score)));

    if (!valid) {
      score_error_cnt2++;
    }
  }
  for (auto& it : values_map_before_insert) {
    if (values_map_after_insert.find(it.first) ==
        values_map_after_insert.end()) {
      ++key_miss_cnt;
      continue;
    }
    test_util::ValueArray<V, dim>& vec0 = it.second;
    test_util::ValueArray<V, dim>& vec1 = values_map_after_insert.at(it.first);
    // A key counts at most once, no matter how many components differ.
    for (size_t j = 0; j < dim; j++) {
      if (vec0[j] != vec1[j]) {
        ++value_diff_cnt;
        break;
      }
    }
  }
  std::cout << "Check insert_and_evict behavior got "
            << "key_miss_cnt: " << key_miss_cnt
            << ", value_diff_cnt: " << value_diff_cnt
            << ", score_error_cnt1: " << score_error_cnt1
            << ", score_error_cnt2: " << score_error_cnt2
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << ", dur: " << dur << std::endl;

  ASSERT_EQ(key_miss_cnt, 0);
  ASSERT_EQ(value_diff_cnt, 0);
  ASSERT_EQ(score_error_cnt1, 0);
  ASSERT_EQ(score_error_cnt2, 0);

  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  free(h_tmp_founds);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

void test_insert_and_evict_advanced_on_epochlfu() {
  const size_t U = 1024 * 1024;
  const size_t init_capacity = U;
  const size_t B = 128 * 1024;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.max_bucket_size = 128;
  opt.num_of_buckets_per_alloc = 32;
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kEpochLfu>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(B, dim, stream);
  evict_buffer.ToZeros(stream);

  test_util::KVMSBuffer<i64, f32, u64> data_buffer;
  test_util::KVMSBuffer<i64, f32, u64> pre_data_buffer;
  data_buffer.Reserve(B, dim, stream);
  pre_data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  int freq_range = 100;
  float repeat_rate = 0.9;
  for (unsigned int global_epoch = 1; global_epoch <= 64; global_epoch++) {
    if (global_epoch <= 1) {
      test_util::create_random_keys_advanced<i64, u64, f32>(
          dim, data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
          data_buffer.values_ptr(false), (int)B, B * 16, freq_range);
    } else {
      test_util::create_random_keys_advanced<i64, u64, f32>(
          dim, data_buffer.keys_ptr(false), pre_data_buffer.keys_ptr(false),
          data_buffer.scores_ptr(false), data_buffer.values_ptr(false), (int)B,
          B * 16, freq_range, repeat_rate);
    }
    data_buffer.SyncData(true, stream);
    if (global_epoch <= 1) {
      pre_data_buffer.CopyFrom(data_buffer, stream);
    }

    CheckInsertAndEvictOnEpochLfu<i64, f32, u64, Table>(
        table.get(), &data_buffer, &evict_buffer, &pre_data_buffer, B, stream,
        opt, global_epoch);

    pre_data_buffer.CopyFrom(data_buffer, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    offset += B;
  }
}

void test_insert_and_evict_advanced_on_customized() {
  const size_t U = 1024 * 1024;
  const size_t init_capacity = U;
  const size_t B = 100000;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.max_bucket_size = 128;
  opt.num_of_buckets_per_alloc = 2;
  using Table =
      nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(B, dim, stream);
  evict_buffer.ToZeros(stream);

  test_util::KVMSBuffer<i64, f32, u64> data_buffer;
  data_buffer.Reserve(B, dim, stream);

  size_t offset = 0;
  u64 score = 0;
  for (int i = 0; i < 32; i++) {
    test_util::create_random_keys<i64, u64, f32, dim>(
        data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
        data_buffer.values_ptr(false), (int)B, (int)B * 16);
    data_buffer.SyncData(true, stream);

    CheckInsertAndEvict<i64, f32, u64, Table>(
        table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),
        data_buffer.scores_ptr(), evict_buffer.keys_ptr(),
        evict_buffer.values_ptr(), evict_buffer.scores_ptr(), B, stream, opt);

    offset += B;
    score += 1;
  }
}

/**
 * Checks that insert_and_evict followed by a chunked export_batch yields
 * consistent key/value pairs.
 *
 * A batch of 4096 + 13 keys (more than max_capacity) is inserted into a kLru
 * table. Everything returned as evicted and everything exported afterwards in
 * four capacity/4 sized slices is collected into one map. Every collected
 * value vector is required to hold the key (cast to f32) in each component.
 * Presumably ToRange() fills the buffer that way; verify against test_util.
 */
void test_insert_and_evict_with_export_batch() {
  size_t max_capacity = 4096;
  size_t init_capacity = 2048;
  size_t offset = 0;
  size_t uplimit = 1048576;
  size_t len = 4096 + 13;

  TableOptions opt;
  opt.max_capacity = max_capacity;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = uplimit * dim * sizeof(f32);
  opt.num_of_buckets_per_alloc = 16;
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;
  opt.dim = dim;

  using Vec_t = test_util::ValueArray<f32, dim>;
  std::map<i64, Vec_t> ref_map;
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // The table and buffers live in an inner scope so that they are destroyed
  // before the stream they were used with is released below.
  {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(opt);

    test_util::KVMSBuffer<i64, f32, u64> buffer;
    buffer.Reserve(len, dim, stream);
    test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
    evict_buffer.Reserve(len, dim, stream);

    buffer.ToRange(offset, /*skip=1*/ 1, stream);
    size_t n_evicted = table->insert_and_evict(
        len, buffer.keys_ptr(), buffer.values_ptr(), nullptr,
        evict_buffer.keys_ptr(), evict_buffer.values_ptr(), nullptr, stream);
    printf("Insert %zu keys and evict %zu\n", len, n_evicted);
    evict_buffer.SyncData(/*h2d=*/false, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    for (size_t i = 0; i < n_evicted; i++) {
      Vec_t* vec =
          reinterpret_cast<Vec_t*>(evict_buffer.values_ptr(false) + i * dim);
      ref_map[evict_buffer.keys_ptr(false)[i]] = *vec;
    }

    // Export the table in slices of a quarter of its capacity; the last
    // slice is clamped so it never reaches past the end.
    offset = 0;
    size_t search_len = (table->capacity() >> 2);
    for (; offset < table->capacity(); offset += search_len) {
      if (offset + search_len > table->capacity()) {
        search_len = table->capacity() - offset;
      }
      size_t n_exported =
          table->export_batch(search_len, offset, buffer.keys_ptr(),
                              buffer.values_ptr(), /*scores=*/nullptr, stream);
      // Copy back on the stream that is synchronized right after, so the
      // host reads below see completed data.
      buffer.SyncData(/*h2d=*/false, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      for (size_t i = 0; i < n_exported; i++) {
        Vec_t* vec =
            reinterpret_cast<Vec_t*>(buffer.values_ptr(false) + i * dim);
        for (size_t j = 0; j < dim; j++) {
          ASSERT_EQ(static_cast<f32>(buffer.keys_ptr(false)[i]), (*vec)[j]);
        }
        ref_map[buffer.keys_ptr(false)[i]] = *vec;
      }
    }

    for (auto& it : ref_map) {
      for (size_t j = 0; j < dim; j++) {
        ASSERT_EQ(static_cast<f32>(it.first), it.second.data[j]);
      }
    }
  }

  CUDA_CHECK(cudaStreamDestroy(stream));
}

/**
 * Producer side of the concurrent insert/find test.
 *
 * Repeatedly calls `table->insert_and_evict` with the `len`-sized slice of
 * `keys`/`values` selected by the current value of `*step`, and increments
 * `*step` after every batch until it reaches `total_step`.
 *
 * When `if_check` is true, every iteration additionally exports the table
 * before and after the insertion and verifies that each key known before the
 * call (exported table content plus the incoming batch) is still present
 * afterwards (exported table content plus the evicted entries) with an
 * identical value vector.
 *
 * @param table Table under test.
 * @param keys Keys of all batches; copied with `cudaMemcpyDeviceToHost`, so
 *             presumably a device pointer — confirm against the caller.
 * @param values Values of all batches, `dim` elements per key.
 * @param scores Scores of all batches. Only read by the check path; the
 *               insertion itself passes `nullptr` for the scores.
 * @param evicted_keys Output buffer for evicted keys.
 * @param evicted_values Output buffer for evicted values.
 * @param evicted_scores Output buffer for evicted scores.
 * @param len Number of keys per batch.
 * @param step Shared batch counter, advanced by one per processed batch.
 * @param total_step Number of batches to process.
 * @param stream Stream all CUDA work of this function is issued on.
 * @param if_check Enables the before/after consistency verification.
 */
template <typename K, typename V, typename S, typename Table>
void BatchCheckInsertAndEvict(Table* table, K* keys, V* values, S* scores,
                              K* evicted_keys, V* evicted_values,
                              S* evicted_scores, size_t len,
                              std::atomic<int>* step, size_t total_step,
                              cudaStream_t stream, bool if_check = true) {
  // NOTE(review): both maps live outside the loop and are never cleared, so
  // they accumulate entries across iterations — confirm this is intended.
  std::map<i64, test_util::ValueArray<f32, dim>> map_before_insert;
  std::map<i64, test_util::ValueArray<f32, dim>> map_after_insert;

  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;

  while (step->load() < total_step) {
    size_t table_size_before = table->size(stream);
    // Scratch buffers hold the current table content plus one batch.
    size_t cap = table_size_before + len;
    size_t key_miss_cnt = 0;
    size_t value_diff_cnt = 0;
    size_t table_size_after = 0;
    size_t table_size_verify1 = 0;

    // Index of the batch processed in this iteration.
    int s = step->load();

    if (if_check) {
      CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
      CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
      CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
      CUDA_CHECK(cudaStreamSynchronize(stream));

      h_tmp_keys = (K*)malloc(cap * sizeof(K));
      h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
      h_tmp_scores = (S*)malloc(cap * sizeof(S));

      CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
      CUDA_CHECK(
          cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
      CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));

      // Snapshot of the table before the insertion; the exported count must
      // agree with the size reported by the table.
      size_t table_size_verify0 = table->export_batch(
          table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
      ASSERT_EQ(table_size_before, table_size_verify0);

      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                                 table_size_before * sizeof(K),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                                 table_size_before * dim * sizeof(V),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                                 table_size_before * sizeof(S),
                                 cudaMemcpyDeviceToHost, stream));

      // Append the batch that is about to be inserted behind the snapshot.
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_before, keys + len * s,
                                 len * sizeof(K), cudaMemcpyDeviceToHost,
                                 stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_before * dim,
                                 values + len * s * dim, len * dim * sizeof(V),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_before,
                                 scores + len * s, len * sizeof(S),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Record key -> value vector for everything known before the insertion.
      for (size_t i = 0; i < cap; i++) {
        test_util::ValueArray<V, dim>* vec =
            reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                             i * dim);
        map_before_insert[h_tmp_keys[i]] = *vec;
      }
    }
    // Drain the stream so the timing below only covers insert_and_evict.
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto start = std::chrono::steady_clock::now();
    size_t filtered_len = table->insert_and_evict(
        len, keys + len * s, values + len * s * dim, nullptr, evicted_keys,
        evicted_values, evicted_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::steady_clock::now();
    auto diff =
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

    // Duration of the insert_and_evict call in nanoseconds.
    float dur = diff.count();

    if (if_check) {
      table_size_after = table->size(stream);
      table_size_verify1 = table->export_batch(
          table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);

      ASSERT_EQ(table_size_verify1, table_size_after);

      // Host view after the insertion: exported table content followed by the
      // `filtered_len` entries written to the eviction buffers.
      size_t new_cap = table_size_after + filtered_len;
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys,
                                 table_size_after * sizeof(K),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                                 table_size_after * dim * sizeof(V),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores,
                                 table_size_after * sizeof(S),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys + table_size_after, evicted_keys,
                                 filtered_len * sizeof(K),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values + table_size_after * dim,
                                 evicted_values, filtered_len * dim * sizeof(V),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores + table_size_after,
                                 evicted_scores, filtered_len * sizeof(S),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaStreamSynchronize(stream));
      // Walk backwards so that, for a key occurring more than once, the entry
      // with the lowest index (the exported table content) is stored last and
      // therefore wins.
      int64_t new_cap_i64 = (int64_t)new_cap;
      for (int64_t i = new_cap_i64 - 1; i >= 0; i--) {
        test_util::ValueArray<V, dim>* vec =
            reinterpret_cast<test_util::ValueArray<V, dim>*>(h_tmp_values +
                                                             i * dim);
        map_after_insert[h_tmp_keys[i]] = *vec;
      }

      // Every key known before must still exist afterwards with equal values.
      for (auto& it : map_before_insert) {
        if (map_after_insert.find(it.first) == map_after_insert.end()) {
          ++key_miss_cnt;
          continue;
        }
        test_util::ValueArray<V, dim>& vec0 = it.second;
        test_util::ValueArray<V, dim>& vec1 = map_after_insert.at(it.first);
        for (size_t j = 0; j < dim; j++) {
          if (vec0[j] != vec1[j]) {
            ++value_diff_cnt;
            break;
          }
        }
      }
      ASSERT_EQ(key_miss_cnt, 0);
      ASSERT_EQ(value_diff_cnt, 0);

      // Scratch buffers are re-created in every iteration because `cap`
      // depends on the current table size.
      CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
      CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
      CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
      free(h_tmp_keys);
      free(h_tmp_values);
      free(h_tmp_scores);
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }

    std::cout << "Check insert behavior got step: " << step->load()
              << ",\tduration: " << dur
              << ",\twhile value_diff_cnt: " << value_diff_cnt
              << ", while table_size_before: " << table_size_before
              << ", while table_size_after: " << table_size_after
              << ", while len: " << len << std::endl;

    // Publish progress; BatchCheckFind polls this counter.
    step->fetch_add(1);
  }
}

/**
 * Consumer side of the concurrent insert/find test.
 *
 * Polls the shared `step` counter and, each time another `find_interval`
 * batches have been processed by the producer, looks up the corresponding
 * window of `len * find_interval` keys with `table->find`.
 *
 * When `if_check` is true, every found value element must equal
 * `static_cast<float>(key * 0.00001)`, and `table->contains` on the same
 * window of keys must report as many hits as `find` did.
 *
 * @param table Table under test.
 * @param keys Device pointer to the keys of all batches.
 * @param values Unused; kept for signature symmetry with the insert helper.
 * @param scores Unused; kept for signature symmetry with the insert helper.
 * @param len Number of keys per producer batch.
 * @param step Shared batch counter advanced by the producer.
 * @param total_step Number of batches the producer will process.
 * @param find_interval Number of producer batches per lookup window.
 * @param stream Stream all CUDA work of this function is issued on.
 * @param if_check Enables result verification.
 *
 * @note A failing `ASSERT_*` returns early and skips the cleanup at the end.
 */
template <typename K, typename V, typename S, typename Table>
void BatchCheckFind(Table* table, K* keys, V* values, S* scores, size_t len,
                    std::atomic<int>* step, size_t total_step,
                    size_t find_interval, cudaStream_t stream,
                    bool if_check = true) {
  K* h_tmp_keys = nullptr;
  V* h_tmp_values = nullptr;
  S* h_tmp_scores = nullptr;
  bool* h_tmp_founds = nullptr;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;
  bool* d_tmp_founds = nullptr;

  size_t find_step = 0;
  const size_t cap = len * find_interval;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_founds, cap * sizeof(bool), stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  h_tmp_keys = (K*)malloc(cap * sizeof(K));
  h_tmp_values = (V*)malloc(cap * dim * sizeof(V));
  h_tmp_scores = (S*)malloc(cap * sizeof(S));
  h_tmp_founds = (bool*)malloc(cap * sizeof(bool));

  while (static_cast<size_t>(step->load()) < total_step) {
    // Spin until the producer has completed the next window. The counter is
    // sampled once per iteration so that the exit decision below is taken on
    // the same value that ended the wait.
    size_t produced = static_cast<size_t>(step->load());
    while (find_step >= produced / find_interval && produced < total_step) {
      produced = static_cast<size_t>(step->load());
    }
    // The producer finished without completing another full window (only
    // possible when total_step is not a multiple of find_interval); without
    // this exit the wait above would never terminate.
    if (find_step >= produced / find_interval) break;

    size_t found_num = 0;
    size_t value_diff_cnt = 0;

    CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
    CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
    CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));
    CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));

    // Select the window of keys belonging to this lookup step.
    CUDA_CHECK(cudaMemcpyAsync(d_tmp_keys, keys + cap * find_step,
                               cap * sizeof(K), cudaMemcpyDeviceToDevice,
                               stream));

    // Drain the stream so the timing below only covers the find call.
    CUDA_CHECK(cudaStreamSynchronize(stream));

    auto start = std::chrono::steady_clock::now();
    table->find(cap, d_tmp_keys, d_tmp_values, d_tmp_founds, d_tmp_scores,
                stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    auto end = std::chrono::steady_clock::now();
    auto diff =
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

    // Duration of the find call in nanoseconds.
    float dur = diff.count();

    if (if_check) {
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys, d_tmp_keys, cap * sizeof(K),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_values, d_tmp_values,
                                 cap * dim * sizeof(V), cudaMemcpyDeviceToHost,
                                 stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_scores, d_tmp_scores, cap * sizeof(S),
                                 cudaMemcpyDeviceToHost, stream));
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, cap * sizeof(bool),
                                 cudaMemcpyDeviceToHost, stream));

      CUDA_CHECK(cudaStreamSynchronize(stream));

      for (size_t i = 0; i < cap; i++) {
        if (h_tmp_founds[i]) {
          for (size_t j = 0; j < dim; j++) {
            if (h_tmp_values[i * dim + j] !=
                static_cast<float>(h_tmp_keys[i] * 0.00001)) {
              value_diff_cnt++;
            }
          }
          found_num++;
        }
      }
      ASSERT_EQ(value_diff_cnt, 0);

      // Cross-check with contains() on the SAME window of keys that find()
      // just processed, keeping all work ordered on `stream`.
      CUDA_CHECK(cudaMemsetAsync(d_tmp_founds, 0, cap * sizeof(bool), stream));
      table->contains(cap, d_tmp_keys, d_tmp_founds, stream);
      CUDA_CHECK(cudaMemcpyAsync(h_tmp_founds, d_tmp_founds, cap * sizeof(bool),
                                 cudaMemcpyDeviceToHost, stream));
      // The host must not read the flags before the copy has completed.
      CUDA_CHECK(cudaStreamSynchronize(stream));
      size_t contains_num = 0;
      for (size_t i = 0; i < cap; i++) {
        if (h_tmp_founds[i]) contains_num++;
      }
      ASSERT_EQ(contains_num, found_num);
    }
    std::cout << std::endl
              << "\nCheck find behavior got step: " << find_step
              << ",\tduration: " << dur
              << ",\twhile value_diff_cnt: " << value_diff_cnt
              << ", while cap: " << cap << std::endl
              << std::endl;
    ASSERT_EQ(value_diff_cnt, 0);
    find_step++;
  }
  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_founds, stream));
  free(h_tmp_keys);
  free(h_tmp_values);
  free(h_tmp_scores);
  free(h_tmp_founds);

  CUDA_CHECK(cudaStreamSynchronize(stream));
}

/**
 * Regression test: entries reported as evicted by `insert_and_evict` must
 * never carry the key 0.
 *
 * The table is filled once, part of it is erased, and a second batch is
 * inserted with an eviction key buffer that was zeroed beforehand. Because
 * neither batch is generated starting at key 0 (ranges start at 1 and 3000),
 * a 0 among the first `n_evicted` output keys indicates an output slot that
 * was counted but never written.
 */
void test_insert_and_evict_bugfix_no_zero_eviction() {
  size_t max_capacity = 2048;
  size_t init_capacity = 2048;
  size_t remove_len = 1024;
  size_t insert_len = 2048;

  TableOptions opt;
  opt.max_capacity = max_capacity;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = 1024 * 1024 * sizeof(f32);
  opt.num_of_buckets_per_alloc = 16;
  opt.dim = dim;

  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  // First batch: `max_capacity` keys generated from start value 1.
  test_util::KVMSBuffer<i64, f32, u64> buffer_init;
  buffer_init.Reserve(max_capacity, dim, stream);
  buffer_init.ToRange(1, 1, stream);

  test_util::KVMSBuffer<i64, f32, u64> evict_buffer_init;
  evict_buffer_init.Reserve(max_capacity, dim, stream);

  size_t n_evicted = table->insert_and_evict(
      max_capacity, buffer_init.keys_ptr(), buffer_init.values_ptr(), nullptr,
      evict_buffer_init.keys_ptr(), evict_buffer_init.values_ptr(), nullptr,
      stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Erase the first `remove_len` keys of the initial batch.
  table->erase(remove_len, buffer_init.keys_ptr(), stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Second batch: `insert_len` keys generated from start value 3000.
  test_util::KVMSBuffer<i64, f32, u64> buffer_new;
  buffer_new.Reserve(insert_len, dim, stream);
  buffer_new.ToRange(3000, 1, stream);

  test_util::KVMSBuffer<i64, f32, u64> evict_buffer_new;
  evict_buffer_new.Reserve(insert_len, dim, stream);

  // Zero the output keys so that an unwritten slot is recognizable as 0.
  CUDA_CHECK(cudaMemsetAsync(evict_buffer_new.keys_ptr(), 0,
                             insert_len * sizeof(i64), stream));

  n_evicted = table->insert_and_evict(
      insert_len, buffer_new.keys_ptr(), buffer_new.values_ptr(), nullptr,
      evict_buffer_new.keys_ptr(), evict_buffer_new.values_ptr(), nullptr,
      stream);

  evict_buffer_new.SyncData(/*h2d=*/false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  printf("Second insert evicted %zu keys\n", n_evicted);

  for (size_t i = 0; i < n_evicted; i++) {
    i64 evicted_key = evict_buffer_new.keys_ptr(false)[i];
    ASSERT_NE(evicted_key, 0) << "Found 0 (uninitialized empty key) at evict index " << i;
  }

  // Release the stream created above; all work on it has been synchronized.
  CUDA_CHECK(cudaStreamDestroy(stream));
}

/**
 * Runs `BatchCheckInsertAndEvict` and `BatchCheckFind` concurrently on two
 * host threads, each with its own stream, against one shared LRU table.
 *
 * All `batch_num` batches of `B` random keys are generated up front; the two
 * helpers coordinate through the shared atomic `step` counter. Result
 * verification inside the helpers is disabled (`if_check == false`).
 */
void test_insert_and_evict_run_with_batch_find() {
  using Table = nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kLru>;

  constexpr size_t batch_num = 256;
  constexpr size_t find_interval = 8;
  const size_t U = 16 * 1024 * 1024;
  const size_t B = 256 * 1024;
  const size_t init_capacity = U;
  const bool if_check = false;

  // Progress counter shared between the inserting and the finding thread.
  std::atomic<int> step{0};

  TableOptions opt;
  opt.dim = dim;
  opt.init_capacity = init_capacity;
  opt.max_capacity = U;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.num_of_buckets_per_alloc = 128;

  cudaStream_t insert_stream;
  cudaStream_t find_stream;
  CUDA_CHECK(cudaStreamCreate(&insert_stream));
  CUDA_CHECK(cudaStreamCreate(&find_stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  // Holds the input of every batch back to back.
  test_util::KVMSBuffer<i64, f32, u64> global_buffer;
  global_buffer.Reserve(B * batch_num, dim, insert_stream);

  // Receives the entries evicted by a single batch.
  test_util::KVMSBuffer<i64, f32, u64> evict_buffer;
  evict_buffer.Reserve(B, dim, insert_stream);
  evict_buffer.ToZeros(insert_stream);

  // Generate the host-side input batch by batch, then upload it in one go.
  for (size_t batch = 0; batch < batch_num; ++batch) {
    const size_t first = B * batch;
    test_util::create_random_keys<i64, u64, f32, dim>(
        global_buffer.keys_ptr(false) + first,
        global_buffer.scores_ptr(false) + first,
        global_buffer.values_ptr(false) + first * dim, static_cast<int>(B));
  }
  global_buffer.SyncData(true, insert_stream);
  CUDA_CHECK(cudaStreamSynchronize(insert_stream));

  auto insert_and_evict_func = [&]() {
    BatchCheckInsertAndEvict<i64, f32, u64, Table>(
        table.get(), global_buffer.keys_ptr(), global_buffer.values_ptr(),
        global_buffer.scores_ptr(), evict_buffer.keys_ptr(),
        evict_buffer.values_ptr(), evict_buffer.scores_ptr(), B, &step,
        batch_num, insert_stream, if_check);
  };

  auto find_func = [&]() {
    BatchCheckFind<i64, f32, u64, Table>(
        table.get(), global_buffer.keys_ptr(), global_buffer.values_ptr(),
        global_buffer.scores_ptr(), B, &step, batch_num, find_interval,
        find_stream, if_check);
  };

  // The finder is started first and waits for the inserter's progress.
  std::thread find_thread(find_func);
  std::thread insert_and_evict_thread(insert_and_evict_func);
  find_thread.join();
  insert_and_evict_thread.join();

  CUDA_CHECK(cudaStreamDestroy(insert_stream));
  CUDA_CHECK(cudaStreamDestroy(find_stream));
}

// GoogleTest registrations for the InsertAndEvictTest suite. Each test case
// simply forwards to the free function of the same name.
TEST(InsertAndEvictTest, test_insert_and_evict_basic) {
  test_insert_and_evict_basic();
}

TEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_lru) {
  test_insert_and_evict_advanced_on_lru();
}

TEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_lfu) {
  test_insert_and_evict_advanced_on_lfu();
}

TEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_epochlru) {
  test_insert_and_evict_advanced_on_epochlru();
}

TEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_epochlfu) {
  test_insert_and_evict_advanced_on_epochlfu();
}

TEST(InsertAndEvictTest, test_insert_and_evict_advanced_on_customized) {
  test_insert_and_evict_advanced_on_customized();
}

TEST(InsertAndEvictTest, test_insert_and_evict_with_export_batch) {
  test_insert_and_evict_with_export_batch();
}

// Concurrent insert_and_evict / find stress run on two host threads.
TEST(InsertAndEvictTest, test_insert_and_evict_run_with_batch_find) {
  test_insert_and_evict_run_with_batch_find();
}

// Regression check that evicted keys are never reported as 0.
TEST(InsertAndEvictTest, test_insert_and_evict_bugfix_no_zero_eviction) {
  test_insert_and_evict_bugfix_no_zero_eviction();
}


================================================
FILE: tests/lock_unlock_test.cc.cu
================================================
/*
 * Copyright (c) 2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <stdio.h>
#include <array>
#include <map>
#include <unordered_map>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

constexpr size_t dim = 64;
using i64 = int64_t;
using u64 = uint64_t;
using f32 = float;
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

/**
 * Verifies the interplay of `lock_keys`, `unlock_keys` and `contains`.
 *
 * The table is filled in `U / M` rounds of `M` keys each. In every round the
 * test asserts that
 *   1. directly after insertion all keys are contained and `lock_keys`
 *      reports success for all of them,
 *   2. while the keys are locked, `contains` reports none of them, and
 *   3. after `unlock_keys`, `contains` reports all of them again and the
 *      unlock results match.
 */
void test_lock_and_unlock() {
  TableOptions opt;

  // table setting
  const size_t U = 4 * 1024 * 1024UL;
  const size_t M = 65536UL;
  opt.max_capacity = U;
  opt.init_capacity = U;
  opt.max_hbm_for_vectors = U * dim * sizeof(f32);
  opt.num_of_buckets_per_alloc = 8;

  using Table =
      nv::merlin::HashTable<i64, f32, u64, EvictStrategy::kCustomized>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));
  bool *d_found, *d_lock_results;
  i64** lock_keys_ptr;
  CUDA_CHECK(cudaMalloc(&d_found, M * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_lock_results, M * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&lock_keys_ptr, M * sizeof(i64*)));

  // step1
  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(opt);

  // step2
  test_util::KVMSBuffer<i64, f32, u64> buffer;
  buffer.Reserve(M, dim, stream);

  i64 start = 0;
  for (size_t i = 0; i < U / M; i++) {
    // Each round uses a fresh range of M keys and the round index as score.
    buffer.ToRange(start, 1, stream);
    start += M;
    buffer.Setscore(static_cast<u64>(i), stream);
    table->insert_or_assign(M, buffer.keys_ptr(), buffer.values_ptr(),
                            buffer.scores_ptr(), stream);

    // 1. All inserted keys are contained and can be locked.
    CUDA_CHECK(cudaMemsetAsync(d_found, 0, M * sizeof(bool), stream));
    CUDA_CHECK(cudaMemsetAsync(d_lock_results, 0, M * sizeof(bool), stream));
    table->contains(M, buffer.keys_ptr(), d_found, stream);
    table->lock_keys(M, buffer.keys_ptr(), lock_keys_ptr, d_lock_results,
                     stream, buffer.scores_ptr());
    bool result = test_util::allEqualGpu(d_found, d_lock_results, M, stream);
    ASSERT_EQ(result, true);
    result = test_util::allTrueGpu(d_found, M, stream);
    ASSERT_EQ(result, true);

    // 2. While locked, contains() must match the zeroed `d_lock_results`,
    //    i.e. report none of the keys.
    CUDA_CHECK(cudaMemsetAsync(d_found, 0, M * sizeof(bool), stream));
    CUDA_CHECK(cudaMemsetAsync(d_lock_results, 0, M * sizeof(bool), stream));
    table->contains(M, buffer.keys_ptr(), d_found, stream);
    result = test_util::allEqualGpu(d_found, d_lock_results, M, stream);
    ASSERT_EQ(result, true);

    // 3. After unlocking, every key is visible again.
    CUDA_CHECK(cudaMemsetAsync(d_found, 0, M * sizeof(bool), stream));
    table->unlock_keys(M, lock_keys_ptr, buffer.keys_ptr(), d_lock_results,
                       stream);
    table->contains(M, buffer.keys_ptr(), d_found, stream);
    result = test_util::allEqualGpu(d_found, d_lock_results, M, stream);
    ASSERT_EQ(result, true);
    result = test_util::allTrueGpu(d_found, M, stream);
    ASSERT_EQ(result, true);
  }

  // Make sure nothing is still in flight before releasing the resources.
  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_lock_results));
  CUDA_CHECK(cudaFree(lock_keys_ptr));
  CUDA_CHECK(cudaStreamDestroy(stream));
}

// GoogleTest registration; forwards to test_lock_and_unlock().
TEST(LockAndUnlockTest, test_lock_and_unlock) { test_lock_and_unlock(); }

================================================
FILE: tests/memory_pool_test.cc.cu
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cuda_runtime_api.h>
#include <gtest/gtest.h>
#include <iostream>
#include "merlin/allocator.cuh"
#include "merlin/memory_pool.cuh"

using namespace nv::merlin;

/**
 * Decorator that forwards to the wrapped `Allocator` and writes one line to
 * `std::cout` for every allocation and every release.
 *
 * Each line starts with the wrapped allocator's `name` and the
 * `typeid(...).name()` of the element type.
 */
template <class Allocator>
struct DebugAllocator final
    : AllocatorBase<typename Allocator::type, DebugAllocator<Allocator>> {
  using type = typename Allocator::type;

  static constexpr const char* name{"DebugAllocator"};

  /// Allocates `n` elements through the wrapped allocator and logs the result.
  inline static type* alloc(size_t n, BaseAllocator* allocator,
                            cudaStream_t stream = 0) {
    type* const buffer = Allocator::alloc(n, allocator, stream);
    std::cout << Allocator::name << "[type_name = " << typeid(type).name()
              << "]: ";
    std::cout << static_cast<void*>(buffer) << " allocated = " << n << " x "
              << sizeof(type);
    std::cout << " bytes, stream = " << stream << '\n';
    return buffer;
  }

  /// Releases `ptr` through the wrapped allocator, then logs the release.
  inline static void free(type* ptr, BaseAllocator* allocator,
                          cudaStream_t stream = 0) {
    Allocator::free(ptr, allocator, stream);
    std::cout << Allocator::name << "[type_name = " << typeid(type).name()
              << "]: ";
    std::cout << static_cast<void*>(ptr) << " freed, stream = " << stream
              << '\n';
  }
};

/// Writes a horizontal rule of 80 '-' characters plus a newline to std::cout.
void print_divider() {
  constexpr size_t kRuleWidth{80};
  size_t remaining{kRuleWidth};
  while (remaining > 0) {
    std::cout << '-';
    --remaining;
  }
  std::cout << '\n';
}

/// Prints a framed summary of `opt.max_stock` and `opt.max_pending` to
/// std::cout and flushes the stream.
void print_pool_options(const MemoryPoolOptions& opt) {
  print_divider();
  std::cout << "Memory Pool Configuration\n";
  print_divider();
  std::cout << "opt.max_stock   : " << opt.max_stock << " buffers\n"
            << "opt.max_pending : " << opt.max_pending << " buffers\n";
  print_divider();
  std::cout << std::flush;
}

// Pool configuration shared by the memory pool tests in this file.
MemoryPoolOptions opt{
    3,  //< max_stock
    5,  //< max_pending
};

// Small aggregate used as the element type of the allocators and pools under
// test.
struct SomeType {
  int a;
  float b;

  // Stream output; defined below.
  friend std::ostream& operator<<(std::ostream&, const SomeType&);
};

/// Streams `obj` as "a = <a>, b = <b>". The pointer attributes of `&obj` are
/// queried first; an object reported as device memory is copied to the host
/// before its fields are read.
std::ostream& operator<<(std::ostream& os, const SomeType& obj) {
  cudaPointerAttributes attributes;
  CUDA_CHECK(cudaPointerGetAttributes(&attributes, &obj));

  const bool is_device_memory{attributes.type == cudaMemoryTypeDevice};

  SomeType host_copy;
  if (!is_device_memory) {
    host_copy = obj;
  } else {
    CUDA_CHECK(cudaMemcpy(&host_copy, &obj, sizeof(SomeType),
                          cudaMemcpyDeviceToHost));
  }

  return os << "a = " << host_copy.a << ", b = " << host_copy.b;
}

/// Exercises `StandardAllocator` through `DebugAllocator`: a unique pointer
/// without stream argument, a unique pointer with an explicit `nullptr`
/// stream, and a shared pointer. Each is written to directly, printed, and
/// reset.
void test_standard_allocator() {
  using Allocator = DebugAllocator<StandardAllocator<SomeType>>;
  std::shared_ptr<DefaultAllocator> backing{new DefaultAllocator()};
  DefaultAllocator* const base = backing.get();

  // Unique pointer, no stream argument.
  {
    auto owner = Allocator::make_unique(1, base);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "Sync UPtr after alloc: " << *owner << std::endl;
    *owner = SomeType{47, 11};
    std::cout << "Sync UPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }

  // Unique pointer, explicit null stream.
  {
    auto owner = Allocator::make_unique(1, base, nullptr);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "Async UPtr after alloc: " << *owner << std::endl;
    *owner = SomeType{47, 11};
    std::cout << "Async UPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }

  // Shared pointer.
  {
    auto owner = Allocator::make_shared(1, base);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "SPtr after alloc: " << *owner << std::endl;
    *owner = SomeType{47, 11};
    std::cout << "SPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }
}

/// Exercises `HostAllocator` through `DebugAllocator`: a unique pointer
/// without stream argument, a unique pointer with an explicit `nullptr`
/// stream, and a shared pointer. Each is written to directly, printed, and
/// reset.
void test_host_allocator() {
  using Allocator = DebugAllocator<HostAllocator<SomeType>>;
  std::shared_ptr<DefaultAllocator> backing{new DefaultAllocator()};
  DefaultAllocator* const base = backing.get();

  // Unique pointer, no stream argument.
  {
    auto owner = Allocator::make_unique(1, base);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "Sync UPtr after alloc: " << *owner << std::endl;
    *owner = SomeType{47, 11};
    std::cout << "Sync UPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }

  // Unique pointer, explicit null stream.
  {
    auto owner = Allocator::make_unique(1, base, nullptr);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "Async UPtr after alloc: " << *owner << std::endl;
    *owner = SomeType{47, 11};
    std::cout << "Async UPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }

  // Shared pointer.
  {
    auto owner = Allocator::make_shared(1, base);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "SPtr after alloc: " << *owner << std::endl;
    *owner = SomeType{47, 11};
    std::cout << "SPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }
}

/// Exercises `DeviceAllocator` through `DebugAllocator` on the device with
/// the highest index: a unique pointer without stream, a unique pointer with
/// a stream, and a shared pointer with a stream. Each is filled via
/// `cudaMemcpy` from a host value, printed, and reset.
void test_device_allocator() {
  using Allocator = DebugAllocator<DeviceAllocator<SomeType>>;
  std::shared_ptr<DefaultAllocator> backing{new DefaultAllocator()};
  DefaultAllocator* const base = backing.get();

  int device_count;
  CUDA_CHECK(cudaGetDeviceCount(&device_count));
  MERLIN_CHECK(device_count > 0,
               "Need at least one CUDA capable device for running this test.");

  CUDA_CHECK(cudaSetDevice(device_count - 1));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Unique pointer, no stream argument.
  {
    auto owner = Allocator::make_unique(1, base);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "Sync UPtr after alloc: " << *owner << std::endl;
    const SomeType host_value{47, 11};

    std::cout << "Sync UPtr after alloc get ptr: " << owner.get() << std::endl;
    CUDA_CHECK(cudaMemset(owner.get(), 0, sizeof(SomeType)));
    CUDA_CHECK(cudaMemcpy(owner.get(), &host_value, sizeof(SomeType),
                          cudaMemcpyHostToDevice));
    std::cout << "Sync UPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }

  // Unique pointer bound to `stream`.
  {
    auto owner = Allocator::make_unique(1, base, stream);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "Async UPtr after alloc: " << *owner << std::endl;
    const SomeType host_value{47, 11};
    CUDA_CHECK(cudaMemcpy(owner.get(), &host_value, sizeof(SomeType),
                          cudaMemcpyHostToDevice));
    std::cout << "Async UPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }

  // Shared pointer bound to `stream`.
  {
    auto owner = Allocator::make_shared(1, base, stream);
    ASSERT_NE(owner.get(), nullptr);

    std::cout << "SPtr after alloc: " << *owner << std::endl;
    const SomeType host_value{47, 11};
    CUDA_CHECK(cudaMemcpy(owner.get(), &host_value, sizeof(SomeType),
                          cudaMemcpyHostToDevice));
    std::cout << "SPtr after set: " << *owner << std::endl;

    owner.reset();
    ASSERT_EQ(owner.get(), nullptr);
  }

  CUDA_CHECK(cudaStreamDestroy(stream));
}

/**
 * Walks a `MemoryPool` through a sequence of borrow/return operations while
 * the stream the buffers are bound to stays alive, asserting
 * `current_stock()` and `num_pending()` after every step.
 *
 * @param use_custom_stream If true, a stream is created with
 *        `cudaStreamCreate` and destroyed at the end; otherwise stream 0 is
 *        used.
 */
void test_borrow_return_with_context(const bool use_custom_stream) {
  int num_devices;
  CUDA_CHECK(cudaGetDeviceCount(&num_devices));
  MERLIN_CHECK(num_devices > 0,
               "Need at least one CUDA capable device for running this test.");
  CUDA_CHECK(cudaSetDevice(0));

  cudaStream_t stream{0};
  if (use_custom_stream) {
    CUDA_CHECK(cudaStreamCreate(&stream));
  }

  std::shared_ptr<DefaultAllocator> default_allocator(new DefaultAllocator());
  {
    MemoryPool<DebugAllocator<DeviceAllocator<SomeType>>> pool(
        opt, default_allocator.get());
    const size_t buffer_size{256L * 1024};

    // Initial status.
    std::cout << ".:: Initial state ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 0);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow and return one buffer (unique ptr).
    {
      auto buffer{pool.get_unique(buffer_size, stream)};
      std::cout << ".:: Borrow 1 (unique) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);
    }
    std::cout << ".:: Return 1 (unique) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 0);
    ASSERT_EQ(pool.num_pending(), 1);

    // Await unfinished GPU work (ensure stable situation).
    pool.await_pending(stream);
    std::cout << ".:: Await pending ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 1);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow and return one buffer (shared ptr).
    {
      auto buffer{pool.get_shared(buffer_size, stream)};
      std::cout << ".:: Borrow 1 (shared) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);
    }
    std::cout << ".:: Return 1 (shared) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 0);
    ASSERT_EQ(pool.num_pending(), 1);

    // Await unfinished GPU work (ensure stable situation).
    pool.await_pending(stream);
    std::cout << ".:: Await pending ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 1);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow static workspace with less than `max_stock` buffers.
    {
      auto ws{pool.get_workspace<2>(buffer_size, stream)};
      std::cout << ".:: Borrow 2 (static) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);
    }
    std::cout << ".:: Return 2 (static) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 0);
    ASSERT_EQ(pool.num_pending(), 2);

    // Await unfinished GPU work (ensure stable situation).
    pool.await_pending(stream);
    std::cout << ".:: Await pending ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 2);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow workspace that exceeds base pool size. Possible results:
    // 1. If this thread is slower than the driver:
    //    upon return we will see a partial deallocation before inserting the
    //    last buffer into the pending queue.
    // 2. If the driver is slower than this thread queuing/querying events:
    //    anywhere between 0-3 buffers in stock (partial deallocation) and
    //    1-5 buffers pending. Hence there is no good way to check.
    {
      auto ws{pool.get_workspace<6>(buffer_size, stream)};
      std::cout << ".:: Borrow 6 (static) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);
    }
    std::cout << ".:: Return 6 (static) ::.\n" << pool << std::endl;
    ASSERT_GE(pool.num_pending(), 1);

    // Await unfinished GPU work (ensure stable situation).
    pool.await_pending(stream);
    std::cout << ".:: Await pending ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 3);
    ASSERT_EQ(pool.num_pending(), 0);

    // Pin 1 and deplete stock.
    {
      auto ws{pool.get_workspace<1>(buffer_size, stream)};
      pool.deplete_stock();
      std::cout << ".:: Deplete stock ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);
    }
    // The pinned buffer has been returned and is now pending.
    std::cout << ".:: Deplete stock ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 0);
    ASSERT_EQ(pool.num_pending(), 1);

    // Await unfinished GPU work (ensure stable situation).
    pool.await_pending(stream);
    std::cout << ".:: Await pending ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 1);
    ASSERT_EQ(pool.num_pending(), 0);

    // Increase stock to 3 buffers.
    { auto ws{pool.get_workspace<3>(buffer_size, stream)}; }
    pool.await_pending(stream);
    ASSERT_EQ(pool.current_stock(), 3);
    ASSERT_EQ(pool.num_pending(), 0);

    // Pin 1 of the 3 buffers and release it to make it pending.
    { auto ws{pool.get_workspace<1>(buffer_size, stream)}; }
    ASSERT_EQ(pool.current_stock(), 2);
    ASSERT_EQ(pool.num_pending(), 1);
    std::cout << ".:: Ensure 2 stock + 1 pending situation ::.\n"
              << pool << std::endl;

    // Borrow a buffer that is smaller than the current buffer size.
    {
      auto ws{pool.get_unique(buffer_size / 2, stream)};
      std::cout << ".:: Borrow 1 (smaller) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 1);
      ASSERT_EQ(pool.num_pending(), 1);
    }
    std::cout << ".:: Return 1 (smaller) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 1);
    ASSERT_EQ(pool.num_pending(), 2);

    // Borrow a buffer that is bigger than the current buffer size. This will
    // evict the stock buffers which are smaller, but will not concern the
    // buffers that are still pending.
    {
      auto ws{pool.get_unique(buffer_size + 37, stream)};
      std::cout << ".:: Borrow 1 (bigger) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 2);
    }
    std::cout << ".:: Return 1 (bigger) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 0);
    ASSERT_EQ(pool.num_pending(), 3);

    // Because there are now pending buffers that are too small, they will be
    // cleared once the associated work has been completed.
    pool.await_pending(stream);
    std::cout << ".:: Await pending ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 1);
    ASSERT_EQ(pool.num_pending(), 0);
  }

  // Only a stream created above is non-null and needs to be destroyed.
  if (stream) {
    CUDA_CHECK(cudaStreamDestroy(stream));
  }
}

void test_borrow_return_lost_context() {
  int num_devices;
  CUDA_CHECK(cudaGetDeviceCount(&num_devices));
  MERLIN_CHECK(num_devices > 0,
               "Need at least one CUDA capable device for running this test.");
  CUDA_CHECK(cudaSetDevice(0));

  std::shared_ptr<DefaultAllocator> default_allocator(new DefaultAllocator());
  {
    MemoryPool<DebugAllocator<DeviceAllocator<SomeType>>> pool{
        opt, default_allocator.get()};
    const size_t buffer_size{256L * 1024};

    // Initial status.
    std::cout << ".:: Initial state ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 0);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow and return one buffer (unique ptr).
    {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      auto buffer{pool.get_unique(buffer_size, stream)};
      std::cout << ".:: Borrow 1 (unique) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);

      CUDA_CHECK(cudaStreamDestroy(stream));
    }
    std::cout << ".:: Return 1 (unique) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 1);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow and return one buffer (shared ptr).
    {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      auto buffer{pool.get_shared(buffer_size)};
      std::cout << ".:: Borrow 1 (shared) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);

      CUDA_CHECK(cudaStreamDestroy(stream));
    }
    std::cout << ".:: Return 1 (shared) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 1);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow static workspace with less than `max_stock` buffers.
    {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      auto ws{pool.get_workspace<2>(buffer_size)};
      std::cout << ".:: Borrow 2 (static) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);

      CUDA_CHECK(cudaStreamDestroy(stream));
    }
    std::cout << ".:: Return 2 (static) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 2);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow dynamic workspace with less than `max_stock` buffers.
    {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      auto ws{pool.get_workspace(2, buffer_size)};
      std::cout << ".:: Borrow 2 (dynamic) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);

      CUDA_CHECK(cudaStreamDestroy(stream));
    }

    std::cout << ".:: Return 2 (dynamic) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 2);
    ASSERT_EQ(pool.num_pending(), 0);

    // Await unfinished GPU work (shouldn't change anything).
    pool.await_pending();
    std::cout << ".:: Await pending (shouldn't change anything) ::.\n"
              << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 2);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow workspace that exceeds base pool size.
    {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      auto ws{pool.get_workspace<6>(buffer_size)};
      std::cout << ".:: Borrow 6 (static) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);

      CUDA_CHECK(cudaStreamDestroy(stream));
    }
    std::cout << ".:: Return 6 (static) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), opt.max_stock);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow a buffer that is smaller than the current buffer size.
    {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      auto ws{pool.get_unique(buffer_size / 2)};
      std::cout << ".:: Borrow 1 (smaller) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), opt.max_stock - 1);
      ASSERT_EQ(pool.num_pending(), 0);

      CUDA_CHECK(cudaStreamDestroy(stream));
    }
    std::cout << ".:: Return 1 (smaller) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), opt.max_stock);
    ASSERT_EQ(pool.num_pending(), 0);

    // Borrow a buffer that is bigger than the current buffer size.
    {
      cudaStream_t stream;
      CUDA_CHECK(cudaStreamCreate(&stream));

      auto ws{pool.get_unique(buffer_size + 37)};
      std::cout << ".:: Borrow 1 (bigger) ::.\n" << pool << std::endl;
      ASSERT_EQ(pool.current_stock(), 0);
      ASSERT_EQ(pool.num_pending(), 0);

      CUDA_CHECK(cudaStreamDestroy(stream));
    }
    std::cout << ".:: Return 1 (smaller) ::.\n" << pool << std::endl;
    ASSERT_EQ(pool.current_stock(), 1);
    ASSERT_EQ(pool.num_pending(), 0);
  }
}

// GoogleTest registrations: each case forwards to the free function that
// holds the actual test body.
TEST(MemoryPoolTest, standard_allocator) { test_standard_allocator(); }
TEST(MemoryPoolTest, host_allocator) { test_host_allocator(); }
TEST(MemoryPoolTest, device_allocator) { test_device_allocator(); }
// The borrow/return scenario is run twice, once per value of its boolean
// argument.
// NOTE(review): presumably `false` uses the default stream and `true` a
// dedicated stream — confirm against test_borrow_return_with_context.
TEST(MemoryPoolTest, borrow_return_default_context) {
  test_borrow_return_with_context(false);
}
TEST(MemoryPoolTest, borrow_return_custom_context) {
  test_borrow_return_with_context(true);
}

// Placeholder case for the lost-context scenario. It only prints a notice;
// the real test body stays compiled but is never executed.
TEST(MemoryPoolTest, test_borrow_return_lost_context) {
  // Flip to `true` locally to run the scenario by hand.
  constexpr bool run_manually{false};

  std::cout << "Unfortunately, there is currently no reliable way to test "
               "safely whether a\n";
  std::cout << "stream is alive. Keeping the test around for manual tests.\n";

  if (run_manually) {
    test_borrow_return_lost_context();
  }
}


================================================
FILE: tests/merlin_hashtable_test.cc.cu
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <array>
#include <iostream>
#include <random>
#include <thread>
#include <unordered_set>
#include <vector>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

// Number of elements per value vector in the fixed-dimension tests (assigned
// to `options.dim` and passed to `create_random_keys`).
constexpr size_t DIM = 16;
// Element types used for the key, value and score buffers in the tests.
using K = uint64_t;
using V = float;
using S = uint64_t;
// Short names for the library types referenced throughout this file.
using TableOptions = nv::merlin::HashTableOptions;
using BaseAllocator = nv::merlin::BaseAllocator;
using MemoryType = nv::merlin::MemoryType;
using EvictStrategy = nv::merlin::EvictStrategy;

/*
 * Device-side predicate: an entry matches when the low 7 bits of its key are
 * greater than `pattern` and its score is greater than `threshold`.
 *
 * key       - key of the entry being evaluated.
 * score     - score of the entry being evaluated.
 * pattern   - value the masked key is compared against.
 * threshold - value the score is compared against.
 * Returns true when both comparisons hold.
 */
template <class K, class S>
struct EraseIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    // `>` binds tighter than `&`, so the mask must be parenthesised;
    // otherwise the expression evaluates `key & (0x7f > pattern)`.
    return (((key & 0x7f) > pattern) && (score > threshold));
  }
};

/*
 * Stateful device-side predicate that carries `pattern` and `threshold` as
 * members. An entry matches when the low 7 bits of its key are greater than
 * `pattern` and its score is greater than `threshold`. The `value` pointer
 * and the cooperative group `g` are accepted but not used.
 */
template <class K, class V, class S>
struct EraseIfPredFunctorV2 {
  K pattern;
  S threshold;
  EraseIfPredFunctorV2(K pattern, S threshold)
      : pattern(pattern), threshold(threshold) {}
  template <int GroupSize>
  __forceinline__ __device__ bool operator()(
      const K& key, const V* value, const S& score,
      cg::thread_block_tile<GroupSize>& g) {
    /* evaluate key, score and value. */
    // `>` binds tighter than `&`, so the mask must be parenthesised;
    // otherwise the expression evaluates `key & (0x7f > pattern)`.
    return (((key & 0x7f) > pattern) && (score > threshold));
  }
};

/*
 * Stateful device-side predicate that also inspects the value vector.
 *
 * A thread's entry matches when its score is below `threshold` and the group,
 * working together, finds no element among the first `dim` elements of its
 * value that differs from `static_cast<V>(key * 0.00001)`.
 *
 * Note: `dim` is not set by the constructor; it must be assigned before the
 * functor is used. `pattern` is stored but not read by operator().
 */
template <class K, class V, class S>
struct EraseIfPredFunctorV3 {
  K pattern;
  S threshold;
  int dim;
  EraseIfPredFunctorV3(K pattern, S threshold)
      : pattern(pattern), threshold(threshold) {}
  template <int GroupSize>
  __forceinline__ __device__ bool operator()(
      const K& key, const V* value, const S& score,
      cg::thread_block_tile<GroupSize>& g) {
    /* evaluate key, score and value. */
    // Each thread starts from the score-only decision for its own entry.
    bool pred = score < threshold;

    // Visit the entry held by every lane `i` of the group in turn.
    for (int i = 0; i < g.size(); i++) {
      // Broadcast lane i's value pointer, key and current decision to all
      // threads of the group.
      auto cur_value = g.shfl(value, i);
      auto cur_key = g.shfl(key, i);
      bool cur_pred = g.shfl(pred, i);
      // Lane i already rejected its entry; nothing to verify.
      if (cur_pred == false) continue;
      unsigned int vote = 0;
      /* evaluate one value cooperatively in one loop. */
      // Thread t checks elements t, t + g.size(), ... of lane i's value.
      // NOTE(review): the ballot is only reached by threads whose `j < dim`;
      // confirm `dim` relative to the group size makes this safe.
      for (int j = g.thread_rank(); j < dim; j += g.size()) {
        if (cur_value[j] != static_cast<V>(cur_key * 0.00001)) cur_pred = false;
        vote = g.ballot(cur_pred == false);
        // Stop as soon as any participating thread saw a mismatch.
        if (vote != 0) break;
      }
      // Only the owner of the entry records the rejection.
      if (g.thread_rank() == i && vote != 0) pred = false;
    }
    return pred;
  }
};

// Tag naming the three erase-predicate variants (V1, V2, V3).
enum class EraseIfVersion { V1, V2, V3 };

/*
 * Device-side predicate that selects entries whose score is strictly greater
 * than `threshold`. `key` and `pattern` are accepted but not consulted.
 */
template <class K, class S>
struct ExportIfPredFunctor {
  __forceinline__ __device__ bool operator()(const K& key, S& score,
                                             const K& pattern,
                                             const S& threshold) {
    const bool above_threshold = (score > threshold);
    return above_threshold;
  }
};

/*
 * BaseAllocator implementation used by the tests. It maps each MemoryType to
 * the matching CUDA runtime or C library allocation call. The asynchronous
 * entry points accept MemoryType::Device only.
 */
class CustomizedAllocator : public virtual BaseAllocator {
 public:
  CustomizedAllocator() {}
  ~CustomizedAllocator() override {}

  // Allocates `size` bytes of the requested memory type and stores the result
  // in `*ptr`. `pinned_flags` is only used for MemoryType::Pinned.
  void alloc(const MemoryType type, void** ptr, size_t size,
             unsigned int pinned_flags = cudaHostAllocDefault) override {
    if (type == MemoryType::Device) {
      CUDA_CHECK(cudaMalloc(ptr, size));
    } else if (type == MemoryType::Managed) {
      CUDA_CHECK(cudaMallocManaged(ptr, size, cudaMemAttachGlobal));
    } else if (type == MemoryType::Pinned) {
      CUDA_CHECK(cudaMallocHost(ptr, size, pinned_flags));
    } else if (type == MemoryType::Host) {
      *ptr = std::malloc(size);
    }
  }

  // Allocates device memory on `stream`; any other memory type is rejected
  // through MERLIN_CHECK.
  void alloc_async(const MemoryType type, void** ptr, size_t size,
                   cudaStream_t stream) override {
    if (type != MemoryType::Device) {
      MERLIN_CHECK(false,
                   "[CustomizedAllocator] alloc_async is only support for "
                   "MemoryType::Device!");
      return;
    }
    CUDA_CHECK(cudaMallocAsync(ptr, size, stream));
  }

  // Releases memory obtained through `alloc`. A null `ptr` is a no-op.
  void free(const MemoryType type, void* ptr) override {
    if (ptr == nullptr) {
      return;
    }
    if (type == MemoryType::Pinned) {
      CUDA_CHECK(cudaFreeHost(ptr));
    } else if (type == MemoryType::Device || type == MemoryType::Managed) {
      CUDA_CHECK(cudaFree(ptr));
    } else if (type == MemoryType::Host) {
      std::free(ptr);
    }
  }

  // Releases device memory on `stream`. A null `ptr` is a no-op; any other
  // memory type is rejected through MERLIN_CHECK.
  void free_async(const MemoryType type, void* ptr,
                  cudaStream_t stream) override {
    if (ptr == nullptr) {
      return;
    }

    if (type != MemoryType::Device) {
      MERLIN_CHECK(false,
                   "[CustomizedAllocator] free_async is only support for "
                   "MemoryType::Device!");
      return;
    }
    CUDA_CHECK(cudaFreeAsync(ptr, stream));
  }
};

// Walks one HashTable instance through its basic API and checks the results
// on the host after each step: init, insert_or_assign, find, contains,
// overwrite via insert_or_assign, accum_or_assign, erase, clear, re-insert,
// find with scores, and export_batch.
//
// max_hbm_for_vectors: passed through nv::merlin::GB() and stored in
//                      `options.max_hbm_for_vectors`.
//
// The checks assume that `create_random_keys` produces scores equal to the
// keys and vector elements equal to `key * 0.00001` (this is what the
// assertions compare against).
void test_basic(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL - (128 + 1);
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  options.reserved_key_start_bit = 2;
  options.num_of_buckets_per_alloc = 32;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Pinned host staging buffers.
  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_new_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  // Device buffers, then upload of the generated keys, scores and vectors.
  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    // A freshly initialised table has the expected bucket count and is empty.
    ASSERT_EQ(table->bucket_count(),
              524287);  // 1 + (INIT_CAPACITY / options.max_bucket_size)
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Insert all keys; the table must then hold exactly KEY_NUM entries.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Look the keys up again (without scores) into a zeroed output buffer.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    // The destination is host memory and the source is device memory, so the
    // copy kind must be device-to-host.
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }

    // `contains` must report the same number of hits as `find`.
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    // Overwrite every value with a buffer whose bytes are all 0x02; the entry
    // count must not change.
    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));
    table->insert_or_assign(KEY_NUM, d_keys,
                            reinterpret_cast<float*>(d_new_vectors), d_scores,
                            stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Read the overwritten values back and compare them bit-for-bit.
    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, reinterpret_cast<float*>(d_new_vectors),
                d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    // Bit pattern of a 4-byte float whose bytes are all 0x02.
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  *(reinterpret_cast<float*>(&i_value)));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    // accum_or_assign on existing keys must not change the entry count.
    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,
                           stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Erase the first half of the keys; half of the entries must remain.
    table->erase(KEY_NUM >> 1, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, total_size >> 1);

    // Clear the table completely.
    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Re-insert everything, then zero the device buffers so the following
    // find has to repopulate both scores and vectors.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    // Export the whole table into zeroed buffers and validate every exported
    // entry on the host.
    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ASSERT_EQ(dump_counter, KEY_NUM);
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_new_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Walks one HashTable instance, configured with
// `num_of_buckets_per_alloc = 2048` and an initial capacity equal to the
// maximum capacity, through its basic API and checks the results on the host
// after each step: init, insert_or_assign, find, contains, overwrite via
// insert_or_assign, accum_or_assign, erase, clear, re-insert, find with
// scores, and export_batch.
//
// max_hbm_for_vectors: passed through nv::merlin::GB() and stored in
//                      `options.max_hbm_for_vectors`.
//
// The checks assume that `create_random_keys` produces scores equal to the
// keys and vector elements equal to `key * 0.00001` (this is what the
// assertions compare against).
void test_basic_without_rehash(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128;
  constexpr uint64_t NUM_OF_BUCKETS_PER_ALLOC = 2048;
  constexpr uint64_t INIT_CAPACITY =
      64 * 1024 * 1024UL - (NUM_OF_BUCKETS_PER_ALLOC * BUCKET_MAX_SIZE) + 1;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  options.reserved_key_start_bit = 2;
  options.num_of_buckets_per_alloc = NUM_OF_BUCKETS_PER_ALLOC;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Pinned host staging buffers.
  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_new_vectors;
  bool* d_found;
  size_t dump_counter = 0;

  // Device buffers, then upload of the generated keys, scores and vectors.
  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    // A freshly initialised table has the expected bucket count and is empty.
    ASSERT_EQ(table->bucket_count(),
              522241);  // 1 + (INIT_CAPACITY / options.max_bucket_size)
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Insert all keys; the table must then hold exactly KEY_NUM entries.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Look the keys up again (without scores) into a zeroed output buffer.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    // The destination is host memory and the source is device memory, so the
    // copy kind must be device-to-host.
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }

    // `contains` must report the same number of hits as `find`.
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    // Overwrite every value with a buffer whose bytes are all 0x02; the entry
    // count must not change.
    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));
    table->insert_or_assign(KEY_NUM, d_keys,
                            reinterpret_cast<float*>(d_new_vectors), d_scores,
                            stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Read the overwritten values back and compare them bit-for-bit.
    CUDA_CHECK(cudaMemset(d_new_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, reinterpret_cast<float*>(d_new_vectors),
                d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_new_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    found_num = 0;
    // Bit pattern of a 4-byte float whose bytes are all 0x02.
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  *(reinterpret_cast<float*>(&i_value)));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    // accum_or_assign on existing keys must not change the entry count.
    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,
                           stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Erase the first half of the keys; half of the entries must remain.
    table->erase(KEY_NUM >> 1, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, total_size >> 1);

    // Clear the table completely.
    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Re-insert everything, then zero the device buffers so the following
    // find has to repopulate both scores and vectors.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    found_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    // Export the whole table into zeroed buffers and validate every exported
    // entry on the host.
    CUDA_CHECK(cudaMemset(d_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    ASSERT_EQ(dump_counter, KEY_NUM);
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_new_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Inserts KEY_NUM random keys into a fresh kCustomized table, reads them back
 * with `find`, and cross-checks the number of hits against `contains`.
 *
 * @tparam V Element type of the value vectors stored in the table.
 * @param dim Number of V elements per key; copied into `options.dim`.
 * @param load_scores If true, `find` receives `d_scores` as its score output
 *   and every returned score is asserted to equal its key. If false, `nullptr`
 *   is passed as the score output and the scores are not checked.
 */
template <typename V>
void test_find_using_pipeline(int dim, bool load_scores) {
  using TableOptions = nv::merlin::HashTableOptions;
  constexpr uint64_t BUCKET_MAX_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = 128 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = dim;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(16);

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  // Host-side staging buffers (pinned via cudaMallocHost).
  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

  test_util::create_random_keys<K, S, V>(options.dim, h_keys, h_scores,
                                         h_vectors, KEY_NUM);

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;

  // Device-side mirrors of the host buffers.
  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  CUDA_CHECK(
      cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A new table per round; it is destroyed at the end of the loop body,
    // before the stream is destroyed.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Wipe the output buffers so the checks below only see what find() wrote.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
    if (load_scores) {
      table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
    } else {
      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
    }
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;
    // All three results live on the device, so every copy is device-to-host.
    // The score copy previously passed cudaMemcpyHostToDevice, which does not
    // match its (host dst, device src) pointers.
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) found_num++;
      // NOTE(review): relies on create_random_keys producing score == key and
      // vector element == key * 0.00001 -- confirm against test_util.
      if (load_scores) ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<V>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    // contains() must report the same number of hits as find().
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Exercises a kCustomized table whose capacity equals the size of the batch
 * inserted into it.
 *
 * Inserts KEY_NUM random keys, checks that `find` reports exactly as many hits
 * as `size()` reports entries (and that each hit carries the expected vector),
 * erases the whole batch, and verifies that re-inserting it yields the same
 * size as the first insert.
 *
 * @param max_hbm_for_vectors Forwarded through nv::merlin::GB() into
 *   `options.max_hbm_for_vectors`.
 */
void test_basic_when_full(size_t max_hbm_for_vectors) {
  constexpr uint64_t INIT_CAPACITY = 1 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 3;
  options.num_of_buckets_per_alloc = 32;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  auto customized_allocator = std::make_unique<CustomizedAllocator>();

  // Buffer sizes in bytes, reused by every allocation, memset and copy below.
  const auto keys_bytes = KEY_NUM * sizeof(K);
  const auto scores_bytes = KEY_NUM * sizeof(S);
  const auto vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
  const auto found_bytes = KEY_NUM * sizeof(bool);

  // Host staging buffers.
  K* h_keys = nullptr;
  S* h_scores = nullptr;
  V* h_vectors = nullptr;
  bool* h_found = nullptr;
  CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&h_scores, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));
  CUDA_CHECK(cudaMallocHost(&h_found, found_bytes));

  CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  // Device buffers; d_def_val receives the vectors returned by find().
  K* d_keys = nullptr;
  S* d_scores = nullptr;
  V* d_vectors = nullptr;
  V* d_def_val = nullptr;
  bool* d_found = nullptr;
  CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_def_val, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_found, found_bytes));

  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_scores, h_scores, scores_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_vectors, h_vectors, vectors_bytes, cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMemset(d_def_val, 2, vectors_bytes));
  CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (uint64_t round = 0; round < TEST_TIMES; ++round) {
    // Fresh table per round, backed by the customized allocator.
    auto table = std::make_unique<Table>();
    table->init(options, customized_allocator.get());
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_insert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    {
      // Look every key up again; the hit count must match the table size.
      CUDA_CHECK(cudaMemset(d_def_val, 0, vectors_bytes));
      table->find(KEY_NUM, d_keys, d_def_val, d_found, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      int found_num = 0;

      CUDA_CHECK(cudaMemset(h_found, 0, found_bytes));
      CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));
      CUDA_CHECK(
          cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors, d_def_val, vectors_bytes,
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (!h_found[i]) continue;
        ++found_num;
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
      ASSERT_EQ(total_size_after_insert, found_num);
    }

    // Erasing the batch must empty the table.
    table->erase(KEY_NUM, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, 0);

    // Re-inserting the same batch must land on the same size as before.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    uint64_t total_size_after_reinsert = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_insert, total_size_after_reinsert);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(
      cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Checks predicate-based erasure on a small kCustomized table.
 *
 * Inserts KEY_NUM keys produced by test_util::create_keys_in_one_buckets,
 * erases through the API variant selected by EV, and then verifies that
 * erased + remaining == BUCKET_MAX_SIZE, that `find` returns exactly the
 * remaining keys with intact scores and vectors, and that `contains` agrees
 * with `find`.
 *
 * @tparam EV Which erase entry point to use: V1 calls `erase_if` with
 *   EraseIfPredFunctor; V2 and V3 call `erase_if_v2` with EraseIfPredFunctorV2
 *   and EraseIfPredFunctorV3 respectively.
 * @param max_hbm_for_vectors Forwarded through nv::merlin::GB() into
 *   `options.max_hbm_for_vectors`.
 */
template <EraseIfVersion EV>
void test_erase_if_pred(size_t max_hbm_for_vectors) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;
  constexpr uint64_t BUCKET_MAX_SIZE = 128;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 4;
  options.num_of_buckets_per_alloc = 2;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // One table is shared by all rounds; it is cleared at the end of each.
  auto table = std::make_unique<Table>();
  table->init(options);

  // Buffer sizes in bytes.
  const auto keys_bytes = KEY_NUM * sizeof(K);
  const auto scores_bytes = KEY_NUM * sizeof(S);
  const auto vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
  const auto found_bytes = KEY_NUM * sizeof(bool);

  K* h_keys = nullptr;
  S* h_scores = nullptr;
  V* h_vectors = nullptr;
  bool* h_found = nullptr;
  CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&h_scores, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));
  CUDA_CHECK(cudaMallocHost(&h_found, found_bytes));

  K* d_keys = nullptr;
  S* d_scores = nullptr;
  V* d_vectors = nullptr;
  bool* d_found = nullptr;
  CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_found, found_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (uint64_t round = 0; round < TEST_TIMES; ++round) {
    // Generate the batch on the host and upload it.
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(
        cudaMemcpy(d_scores, h_scores, scores_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, vectors_bytes,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

    // Run the erase variant selected at compile time.
    K pattern = 100;
    S threshold = 0;
    size_t erase_num = 0;
    switch (EV) {
      case EraseIfVersion::V1:
        erase_num = table->template erase_if<EraseIfPredFunctor>(
            pattern, threshold, stream);
        break;
      case EraseIfVersion::V2: {
        EraseIfPredFunctorV2<K, V, S> pred(pattern, threshold);
        erase_num = table->template erase_if_v2<EraseIfPredFunctorV2<K, V, S>>(
            pred, stream);
        break;
      }
      case EraseIfVersion::V3: {
        EraseIfPredFunctorV3<K, V, S> pred(pattern, threshold);
        pred.dim = options.dim;
        erase_num = table->template erase_if_v2<EraseIfPredFunctorV3<K, V, S>>(
            pred, stream);
        break;
      }
      default:
        break;
    }
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ((erase_num + total_size), BUCKET_MAX_SIZE);

    // Survivors must still be retrievable with their original payload.
    CUDA_CHECK(cudaMemset(d_vectors, 0, vectors_bytes));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, found_bytes));
    CUDA_CHECK(cudaMemset(h_scores, 0, scores_bytes));
    CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));
    CUDA_CHECK(
        cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(
        cudaMemcpy(h_scores, d_scores, scores_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, vectors_bytes,
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (!h_found[i]) continue;
      ++found_num;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, (BUCKET_MAX_SIZE - erase_num));

    // contains() must report the same survivors as find().
    CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(
        cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      contains_num += h_found[i] ? 1 : 0;
    }
    ASSERT_EQ(contains_num, found_num);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(
      cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Verifies that table contents survive an explicit `reserve` to MAX_CAPACITY.
 *
 * Each of the TEST_TIMES rounds builds a new kCustomized table, inserts
 * KEY_NUM keys (twice INIT_CAPACITY), exports them, grows the table with
 * `reserve`, and then checks that the size is unchanged and that the first
 * BUCKET_MAX_SIZE exported keys are still found with matching scores and
 * vectors.
 *
 * @param max_hbm_for_vectors Forwarded through nv::merlin::GB() into
 *   `options.max_hbm_for_vectors`.
 */
void test_rehash(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = BUCKET_MAX_SIZE;
  constexpr uint64_t MAX_CAPACITY = 4 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = BUCKET_MAX_SIZE * 2;
  constexpr uint64_t TEST_TIMES = 100;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 5;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // Buffer sizes in bytes.
  const auto keys_bytes = KEY_NUM * sizeof(K);
  const auto scores_bytes = KEY_NUM * sizeof(S);
  const auto vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
  const auto found_bytes = KEY_NUM * sizeof(bool);

  K* h_keys = nullptr;
  S* h_scores = nullptr;
  V* h_vectors = nullptr;
  bool* h_found = nullptr;
  CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&h_scores, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));
  CUDA_CHECK(cudaMallocHost(&h_found, found_bytes));

  K* d_keys = nullptr;
  S* d_scores = nullptr;
  V* d_vectors = nullptr;
  bool* d_found = nullptr;
  size_t dump_counter = 0;
  CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_found, found_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (uint64_t round = 0; round < TEST_TIMES; ++round) {
    auto table = std::make_unique<Table>();
    table->init(options);

    // Regenerate the batch on the host and upload it.
    test_util::create_keys_in_one_buckets<K, S, V, DIM>(
        h_keys, h_scores, h_vectors, KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(
        cudaMemcpy(d_scores, h_scores, scores_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, vectors_bytes,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaDeviceSynchronize());
    ASSERT_EQ(total_size, KEY_NUM);

    // The export overwrites d_keys / d_vectors / d_scores with table content.
    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(dump_counter, KEY_NUM);

    // Grow to the maximum; the entry count must not change.
    table->reserve(MAX_CAPACITY, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(table->capacity(), MAX_CAPACITY);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Only the first BUCKET_MAX_SIZE exported keys are looked up and checked.
    CUDA_CHECK(cudaMemset(d_vectors, 0, vectors_bytes));
    table->find(BUCKET_MAX_SIZE, d_keys, d_vectors, d_found, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, found_bytes));
    CUDA_CHECK(cudaMemset(h_scores, 0, scores_bytes));
    CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, keys_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(
        cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(
        cudaMemcpy(h_scores, d_scores, scores_bytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, vectors_bytes,
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {
      if (!h_found[i]) continue;
      ++found_num;
      ASSERT_EQ(h_scores[i], h_keys[i]);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }
    ASSERT_EQ(found_num, BUCKET_MAX_SIZE);

    // contains() runs over all KEY_NUM keys, but only the first
    // BUCKET_MAX_SIZE flags are counted so the totals are comparable.
    CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(
        cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
    for (int i = 0; i < BUCKET_MAX_SIZE; i++) {
      contains_num += h_found[i] ? 1 : 0;
    }
    ASSERT_EQ(contains_num, found_num);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(
      cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Checks the capacity of a kCustomized table after two inserts of increasing
 * size with `max_load_factor` set to 0.6.
 *
 * Inserts the first INIT_KEY_NUM keys and expects capacity == 2 *
 * INIT_CAPACITY, then inserts the full KEY_NUM batch and expects capacity ==
 * 4 * KEY_NUM. Afterwards the table is exported and every exported key must be
 * found (with matching score and vector) and reported by `contains`.
 *
 * @param max_hbm_for_vectors Forwarded through nv::merlin::GB() into
 *   `options.max_hbm_for_vectors`.
 */
void test_rehash_on_big_batch(size_t max_hbm_for_vectors) {
  constexpr uint64_t INIT_CAPACITY = 1024;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024;
  constexpr uint64_t INIT_KEY_NUM = 1024;
  constexpr uint64_t KEY_NUM = 2048;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 6;
  options.num_of_buckets_per_alloc = 8;
  options.max_bucket_size = 128;
  options.max_load_factor = 0.6;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // Buffer sizes in bytes.
  const auto keys_bytes = KEY_NUM * sizeof(K);
  const auto scores_bytes = KEY_NUM * sizeof(S);
  const auto vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
  const auto found_bytes = KEY_NUM * sizeof(bool);

  K* h_keys = nullptr;
  S* h_scores = nullptr;
  V* h_vectors = nullptr;
  bool* h_found = nullptr;
  CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&h_scores, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));
  CUDA_CHECK(cudaMallocHost(&h_found, found_bytes));

  K* d_keys = nullptr;
  S* d_scores = nullptr;
  V* d_vectors = nullptr;
  bool* d_found = nullptr;
  size_t dump_counter = 0;
  CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_found, found_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  uint64_t expected_size = 0;
  auto table = std::make_unique<Table>();
  table->init(options);

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_scores, h_scores, scores_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_vectors, h_vectors, vectors_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));

  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);

  // Step 1: only the leading INIT_KEY_NUM keys.
  table->insert_or_assign(INIT_KEY_NUM, d_keys, d_vectors, d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  expected_size = INIT_KEY_NUM;

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), (INIT_CAPACITY * 2));

  // Step 2: the whole batch, which includes the keys inserted in step 1.
  table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  expected_size = KEY_NUM;

  total_size = table->size(stream);
  CUDA_CHECK(cudaDeviceSynchronize());
  ASSERT_EQ(total_size, expected_size);
  ASSERT_EQ(table->capacity(), KEY_NUM * 4);

  // The export overwrites d_keys / d_vectors / d_scores with table content.
  dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                     d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(dump_counter, expected_size);

  CUDA_CHECK(cudaMemset(d_vectors, 0, vectors_bytes));
  table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int found_num = 0;

  CUDA_CHECK(cudaMemset(h_found, 0, found_bytes));
  CUDA_CHECK(cudaMemset(h_scores, 0, scores_bytes));
  CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));
  CUDA_CHECK(cudaMemcpy(h_keys, d_keys, keys_bytes, cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
  CUDA_CHECK(
      cudaMemcpy(h_scores, d_scores, scores_bytes, cudaMemcpyDeviceToHost));
  CUDA_CHECK(
      cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));
  for (int i = 0; i < KEY_NUM; i++) {
    if (!h_found[i]) continue;
    ++found_num;
    ASSERT_EQ(h_scores[i], h_keys[i]);
    for (int j = 0; j < options.dim; j++) {
      ASSERT_EQ(h_vectors[i * options.dim + j],
                static_cast<float>(h_keys[i] * 0.00001));
    }
  }
  ASSERT_EQ(found_num, KEY_NUM);

  // contains() must agree with find().
  CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));
  table->contains(KEY_NUM, d_keys, d_found, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  int contains_num = 0;
  CUDA_CHECK(cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
  for (int i = 0; i < KEY_NUM; i++) {
    contains_num += h_found[i] ? 1 : 0;
  }
  ASSERT_EQ(contains_num, found_num);

  table->clear(stream);
  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(
      cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Checks the capacity a kLru table ends up with after a single large insert.
 *
 * The table is configured with init_capacity 50000 and max_capacity 100000;
 * after inserting KEY_NUM random keys (no explicit scores are passed) the
 * reported capacity is asserted to be EXPECTED_MAX_CAPACITY (65536).
 *
 * @param max_hbm_for_vectors Forwarded through nv::merlin::GB() into
 *   `options.max_hbm_for_vectors`.
 */
void test_rehash_on_big_batch_specific(size_t max_hbm_for_vectors) {
  constexpr uint64_t INIT_CAPACITY = 50000;
  constexpr uint64_t MAX_CAPACITY = 100000;
  constexpr uint64_t EXPECTED_MAX_CAPACITY = 65536;
  constexpr uint64_t KEY_NUM = 50000;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 7;
  options.num_of_buckets_per_alloc = 16;
  options.max_bucket_size = 128;
  options.max_load_factor = 0.6;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // Buffer sizes in bytes.
  const auto keys_bytes = KEY_NUM * sizeof(K);
  const auto scores_bytes = KEY_NUM * sizeof(S);
  const auto vectors_bytes = KEY_NUM * sizeof(V) * options.dim;

  K* h_keys = nullptr;
  S* h_scores = nullptr;
  V* h_vectors = nullptr;
  CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&h_scores, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));

  K* d_keys = nullptr;
  S* d_scores = nullptr;
  V* d_vectors = nullptr;
  CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  auto table = std::make_unique<Table>();
  table->init(options);

  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                              KEY_NUM);

  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_scores, h_scores, scores_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_vectors, h_vectors, vectors_bytes, cudaMemcpyHostToDevice));

  total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0);

  // Scores are uploaded above, but the insert itself is given nullptr.
  table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(table->capacity(), EXPECTED_MAX_CAPACITY);

  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Drives one shared kLru table from THREAD_N host threads until its capacity
 * has grown to at least half of MAX_CAPACITY.
 *
 * Every worker owns its buffers and CUDA stream. While
 * `table->capacity() * 2 < MAX_CAPACITY` it inserts a fresh batch of KEY_NUM
 * random keys and checks that all of them are immediately found (with the
 * expected vectors) and reported by `contains`. Worker 0 additionally logs
 * each capacity change it observes.
 *
 * @param max_hbm_for_vectors Forwarded through nv::merlin::GB() into
 *   `options.max_hbm_for_vectors`.
 */
void test_dynamic_rehash_on_multi_threads(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_MAX_SIZE = 128ul;
  constexpr uint64_t INIT_CAPACITY = 4 * 1024 - BUCKET_MAX_SIZE - 1;
  constexpr uint64_t MAX_CAPACITY = 16 * 1024 * INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 256;
  constexpr uint64_t THREAD_N = 8;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::vector<std::thread> threads;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 8;
  options.num_of_buckets_per_alloc = 16;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);
  ASSERT_EQ(table->bucket_count(), 32);

  // Body of each worker thread; task_n is the worker's index.
  auto worker_function = [&table, KEY_NUM, options](int task_n) {
    // Capacity last seen by this worker, used for change logging below.
    size_t current_capacity = table->capacity();

    // Per-thread buffer sizes in bytes.
    const auto keys_bytes = KEY_NUM * sizeof(K);
    const auto vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
    const auto found_bytes = KEY_NUM * sizeof(bool);

    K* h_keys = nullptr;
    V* h_vectors = nullptr;
    bool* h_found = nullptr;
    CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
    CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));
    CUDA_CHECK(cudaMallocHost(&h_found, found_bytes));

    K* d_keys = nullptr;
    V* d_vectors = nullptr;
    bool* d_found = nullptr;
    CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
    CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));
    CUDA_CHECK(cudaMalloc(&d_found, found_bytes));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    while (table->capacity() * 2 < MAX_CAPACITY) {
      // New random batch every iteration (no scores are generated).
      test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                  KEY_NUM);
      CUDA_CHECK(
          cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors, vectors_bytes,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));

      table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      // Read the batch straight back.
      CUDA_CHECK(cudaMemset(d_vectors, 0, vectors_bytes));
      table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);

      CUDA_CHECK(cudaStreamSynchronize(stream));
      int found_num = 0;

      CUDA_CHECK(cudaMemset(h_found, 0, found_bytes));
      CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));
      CUDA_CHECK(
          cudaMemcpy(h_keys, d_keys, keys_bytes, cudaMemcpyDeviceToHost));
      CUDA_CHECK(
          cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));

      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, vectors_bytes,
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (!h_found[i]) continue;
        ++found_num;
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
      ASSERT_EQ(found_num, KEY_NUM);

      // contains() must agree with find().
      CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));
      table->contains(KEY_NUM, d_keys, d_found, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      int contains_num = 0;
      CUDA_CHECK(
          cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        contains_num += h_found[i] ? 1 : 0;
      }
      ASSERT_EQ(contains_num, found_num);

      // Only worker 0 reports capacity changes, to keep the log readable.
      if (task_n == 0 && current_capacity != table->capacity()) {
        std::cout << "[test_dynamic_rehash_on_multi_threads] The capacity "
                     "changed from "
                  << current_capacity << " to " << table->capacity()
                  << std::endl;
        current_capacity = table->capacity();
      }
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }
    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, vectors_bytes,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  // Launch all workers, then wait for every one of them.
  for (int i = 0; i < THREAD_N; ++i) {
    threads.emplace_back(worker_function, i);
  }

  for (auto& th : threads) {
    th.join();
  }
  ASSERT_GE(table->capacity() * 2, MAX_CAPACITY);
}

// Exercises `export_batch_if` with `ExportIfPredFunctor` on a small kLru
// table (capacity 256, 128 random keys).
//
// Flow per round:
//   1. Insert KEY_NUM random keys without explicit scores and confirm that
//      `size`, `find` and `contains` all report KEY_NUM entries.
//   2. Export with a threshold sampled from `test_util::host_nano` before the
//      insert and check that the reported counter equals the number of
//      copied-back scores that are greater than the threshold.
//   3. Re-sample the threshold after the insert, export again and expect
//      that nothing is exported.
//   4. Clear the table and confirm it is empty.
//
// `max_hbm_for_vectors` is forwarded to `nv::merlin::GB` to fill
// `options.max_hbm_for_vectors`.
void test_export_batch_if(size_t max_hbm_for_vectors) {
  constexpr uint64_t INIT_CAPACITY = 256UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  size_t h_dump_counter = 0;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 9;
  options.num_of_buckets_per_alloc = 2;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  // Host staging buffers (allocated with cudaMallocHost).
  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;
  size_t* d_dump_counter;
  int found_num = 0;
  bool* h_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));
  CUDA_CHECK(cudaMalloc(&d_dump_counter, sizeof(size_t)));

  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  // Sampled before any insert; used as the export threshold of the first
  // `export_batch_if` call below.
  S threshold = test_util::host_nano<S>();
  uint64_t total_size = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, h_vectors,
                                                KEY_NUM);

    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // No scores are passed here (nullptr).
    // NOTE(review): presumably the kLru table assigns the scores itself --
    // confirm against the table implementation.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // Walk every looked-up key. The bound is KEY_NUM because `h_found` and
    // `h_vectors` are sized by KEY_NUM.
    found_num = 0;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[i * options.dim + j],
                    static_cast<float>(h_keys[i] * 0.00001));
        }
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    K pattern = 100;

    // First export: threshold was sampled before the insert.
    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,
        d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));

    // The reported counter must match the number of scores above the
    // threshold in the copied-back score buffer.
    size_t expected_export_count = 0;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_scores[i] > threshold) expected_export_count++;
    }
    ASSERT_EQ(expected_export_count, h_dump_counter);

    // Second export: threshold re-sampled after the insert; nothing is
    // expected to be exported.
    threshold = test_util::host_nano<S>();
    table->template export_batch_if<ExportIfPredFunctor>(
        pattern, threshold, table->capacity(), 0, d_dump_counter, d_keys,
        d_vectors, d_scores, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaMemcpy(&h_dump_counter, d_dump_counter, sizeof(size_t),
                          cudaMemcpyDeviceToHost));

    ASSERT_EQ(0, h_dump_counter);

    CUDA_CHECK(cudaMemset(h_keys, 0, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMemset(h_scores, 0, KEY_NUM * sizeof(S)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    // NOTE(review): `h_dump_counter` was asserted to be 0 just above, so this
    // loop performs no iterations as written. Kept unchanged; consider
    // running an equivalent check after the first export instead.
    for (int i = 0; i < h_dump_counter; i++) {
      ASSERT_GT(h_scores[i], threshold);
      for (int j = 0; j < options.dim; j++) {
        ASSERT_EQ(h_vectors[i * options.dim + j],
                  static_cast<float>(h_keys[i] * 0.00001));
      }
    }

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);
  }
  CUDA_CHECK(cudaDeviceSynchronize());
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors, KEY_NUM * sizeof(V) * options.dim,
                        cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaFree(d_dump_counter));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Basic API walk-through on a kCustomized table configured with
// `io_by_cpu = true` and `max_hbm_for_vectors = GB(0)`.
//
// Sequence per round: insert -> re-insert same keys -> find -> contains ->
// accum_or_assign -> erase the first half of the keys -> clear -> insert ->
// export_batch. After each step the table size (or the returned counter) is
// compared with the expected number of keys.
void test_basic_for_cpu_io() {
  constexpr uint64_t INIT_CAPACITY = 64 * 1024 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;
  constexpr uint64_t TEST_TIMES = 1;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 10;
  options.max_hbm_for_vectors = nv::merlin::GB(0);
  options.io_by_cpu = true;

  // Buffer sizes in bytes, shared by the allocations, fills and copies below.
  const size_t keys_bytes = KEY_NUM * sizeof(K);
  const size_t scores_bytes = KEY_NUM * sizeof(S);
  const size_t vectors_bytes = KEY_NUM * sizeof(V) * options.dim;
  const size_t found_bytes = KEY_NUM * sizeof(bool);
  const size_t vector_ptrs_bytes = KEY_NUM * sizeof(V*);

  // Host-side staging buffers.
  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;
  CUDA_CHECK(cudaMallocHost(&h_keys, keys_bytes));
  CUDA_CHECK(cudaMallocHost(&h_scores, scores_bytes));
  CUDA_CHECK(cudaMallocHost(&h_vectors, vectors_bytes));
  CUDA_CHECK(cudaMallocHost(&h_found, found_bytes));

  CUDA_CHECK(cudaMemset(h_vectors, 0, vectors_bytes));

  // Only keys and scores are generated; the vectors argument is nullptr.
  test_util::create_random_keys<K, S, V, DIM>(h_keys, h_scores, nullptr,
                                              KEY_NUM);

  // Device-side buffers.
  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  V* d_def_val;
  V** d_vectors_ptr;
  bool* d_found;
  size_t dump_counter = 0;
  CUDA_CHECK(cudaMalloc(&d_keys, keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores, scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_def_val, vectors_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors_ptr, vector_ptrs_bytes));
  CUDA_CHECK(cudaMalloc(&d_found, found_bytes));

  CUDA_CHECK(cudaMemcpy(d_keys, h_keys, keys_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(
      cudaMemcpy(d_scores, h_scores, scores_bytes, cudaMemcpyHostToDevice));

  // Byte-wise fills: the vector payloads are arbitrary patterns, this test
  // never compares vector contents.
  CUDA_CHECK(cudaMemset(d_vectors, 1, vectors_bytes));
  CUDA_CHECK(cudaMemset(d_def_val, 2, vectors_bytes));
  CUDA_CHECK(cudaMemset(d_vectors_ptr, 0, vector_ptrs_bytes));
  CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  uint64_t total_size = 0;
  for (uint64_t round = 0; round < TEST_TIMES; ++round) {
    // A fresh table per round; it must start out empty.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // First insert: every key is new.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Second insert with the same keys and a different payload pattern: the
    // size must not change.
    CUDA_CHECK(cudaMemset(d_vectors, 2, vectors_bytes));
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Lookup: every key is expected to be reported as found.
    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;
    CUDA_CHECK(
        cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));

    for (uint64_t k = 0; k < KEY_NUM; ++k) {
      if (h_found[k]) {
        ++found_num;
      }
    }
    ASSERT_EQ(found_num, KEY_NUM);

    // `contains` must agree with `find`.
    CUDA_CHECK(cudaMemset(d_found, 0, found_bytes));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(
        cudaMemcpy(h_found, d_found, found_bytes, cudaMemcpyDeviceToHost));
    for (uint64_t k = 0; k < KEY_NUM; ++k) {
      if (h_found[k]) {
        ++contains_num;
      }
    }
    ASSERT_EQ(contains_num, found_num);

    // Accumulate-or-assign on the same keys leaves the size unchanged.
    table->accum_or_assign(KEY_NUM, d_keys, d_vectors, d_found, d_scores,
                           stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, KEY_NUM);

    // Erasing the first half of the keys halves the size.
    table->erase(KEY_NUM >> 1, d_keys, stream);
    size_t total_size_after_erase = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size_after_erase, total_size >> 1);

    table->clear(stream);
    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Refill and export everything in one batch.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);

    dump_counter = table->export_batch(table->capacity(), 0, d_keys, d_vectors,
                                       d_scores, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(dump_counter, KEY_NUM);
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(
      cudaMemcpy(h_vectors, d_vectors, vectors_bytes, cudaMemcpyDeviceToHost));

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_def_val));
  CUDA_CHECK(cudaFree(d_vectors_ptr));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks score assignment of a kLru table.
//
// Each round builds a fresh 8-bucket table (128 slots per bucket) and runs
// two phases:
//   Phase 1: insert BASE_KEY_NUM "base" keys without explicit scores, export
//            the table and check that every exported score lies between host
//            timestamps taken right before and right after the insert.
//   Phase 2: insert TEST_KEY_NUM "test" keys (two of which duplicate base
//            keys 72 and 73), check that the table size stays at
//            BUCKET_MAX_SIZE, and check that keys touched by this insert
//            carry a score within [start_ts, end_ts] while all other exported
//            keys carry a score no greater than start_ts.
// Exported vectors are compared against `key * 0.00001` in both phases.
//
// `max_hbm_for_vectors` is forwarded to `nv::merlin::GB` to fill
// `options.max_hbm_for_vectors`.
void test_evict_strategy_lru_basic(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  // Scratch buffers are sized for the larger of the two key sets.
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 11;
  options.num_of_buckets_per_alloc = 4;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  // NOTE(review): presumably both helpers generate keys that map to a single
  // bucket, drawn from disjoint key ranges -- confirm in test_util.
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make the last two test keys duplicates of base keys 72 and 73 (key and
  // vector), so phase 2 mixes new keys with already-present ones.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys and validate scores and vectors.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      // The timestamps bracket the insert; order of these statements matters.
      S start_ts = test_util::host_nano<S>(stream);
      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                              nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Sort a copy so the smallest and largest scores can be range-checked.
      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      ASSERT_GE(h_scores_temp_sorted[0], start_ts);
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1], end_ts);
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: insert the test keys into the already populated table.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts = test_util::host_nano<S>(stream);
      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                              nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts = test_util::host_nano<S>(stream);

      // The table size is expected to stay at BUCKET_MAX_SIZE.
      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Collect the scores of exported keys that belong to the test set.
      // A growable vector (instead of a fixed array plus counter) means no
      // uninitialized element is ever read and the range check below can be
      // skipped safely when no test key was exported.
      std::vector<S> h_scores_temp_sorted;
      h_scores_temp_sorted.reserve(TEMP_KEY_NUM);
      for (int i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {
          ASSERT_GT(h_scores_temp[i], BUCKET_MAX_SIZE);
          h_scores_temp_sorted.push_back(h_scores_temp[i]);
        } else {
          // Keys not touched by this insert keep a score from phase 1.
          ASSERT_LE(h_scores_temp[i], start_ts);
        }
      }
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      if (!h_scores_temp_sorted.empty()) {
        ASSERT_GE(h_scores_temp_sorted.front(), start_ts);
        ASSERT_LE(h_scores_temp_sorted.back(), end_ts);
      }
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks score handling of a kLfu table.
//
// Every round regenerates the key sets, builds a fresh table and runs two
// phases:
//   Phase 1: insert the base keys with scores `key % freq_range`, export and
//            expect exactly those scores back.
//   Phase 2: insert the test keys (two of them duplicate base keys 72 and 73
//            with the same score); keys present in both sets are expected to
//            carry twice `key % freq_range`, all others `key % freq_range`.
// Exported vectors are compared against `key * 0.00001` in both phases.
//
// `max_hbm_for_vectors` is forwarded to `nv::merlin::GB` to fill
// `options.max_hbm_for_vectors`.
void test_evict_strategy_lfu_basic(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 1024;

  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 12;
  options.num_of_buckets_per_alloc = 1;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);

  // Host-side key sets: base, test, and a scratch area for exported data.
  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  int freq_range = 1000;

  // Transfer sizes in bytes for each key set.
  const size_t base_keys_bytes = BASE_KEY_NUM * sizeof(K);
  const size_t base_scores_bytes = BASE_KEY_NUM * sizeof(S);
  const size_t base_vectors_bytes = BASE_KEY_NUM * sizeof(V) * options.dim;
  const size_t test_keys_bytes = TEST_KEY_NUM * sizeof(K);
  const size_t test_scores_bytes = TEST_KEY_NUM * sizeof(S);
  const size_t test_vectors_bytes = TEST_KEY_NUM * sizeof(V) * options.dim;
  const size_t temp_keys_bytes = TEMP_KEY_NUM * sizeof(K);
  const size_t temp_scores_bytes = TEMP_KEY_NUM * sizeof(S);
  const size_t temp_vectors_bytes = TEMP_KEY_NUM * sizeof(V) * options.dim;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, temp_keys_bytes));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, temp_scores_bytes));
  CUDA_CHECK(cudaMalloc(&d_vectors_temp, temp_vectors_bytes));

  // True when `key` occurs in `keys`. Contains no assertions on purpose.
  const auto has_key = [](const std::vector<K>& keys, const K& key) {
    return keys.end() != std::find(keys.begin(), keys.end(), key);
  };

  for (uint64_t round = 0; round < TEST_TIMES; ++round) {
    test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
        h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
        BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
        freq_range);

    test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
        h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
        TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
        0xFFFFFFFFFFFFFFFD, freq_range);

    // Test entries 2 and 3 become copies of base entries 72 and 73: same key,
    // same score formula, same vector.
    h_keys_test[2] = h_keys_base[72];
    h_keys_test[3] = h_keys_base[73];

    h_scores_test[2] = h_keys_base[72] % freq_range;
    h_scores_test[3] = h_keys_base[73] % freq_range;

    for (int d = 0; d < options.dim; d++) {
      h_vectors_test[2 * options.dim + d] =
          h_vectors_base[72 * options.dim + d];
      h_vectors_test[3 * options.dim + d] =
          h_vectors_base[73 * options.dim + d];
    }

    // The stream and the table live for exactly one round.
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    size_t total_size = 0;
    size_t dump_counter = 0;
    S global_epoch = 1;
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: base keys with explicit scores.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(), base_keys_bytes,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            base_scores_bytes, cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            base_vectors_bytes, cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t size_after_base = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(size_after_base, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp, base_keys_bytes,
                            cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            base_scores_bytes, cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            base_vectors_bytes, cudaMemcpyDefault));

      for (int row = 0; row < dump_counter; row++) {
        ASSERT_EQ(h_scores_temp[row], h_keys_temp[row] % freq_range);
        for (int col = 0; col < options.dim; col++) {
          ASSERT_EQ(h_vectors_temp[row * options.dim + col],
                    static_cast<float>(h_keys_temp[row] * 0.00001));
        }
      }
    }

    // Phase 2: test keys on top of the populated table, in the next epoch.
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(), test_keys_bytes,
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            test_scores_bytes, cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            test_vectors_bytes, cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t size_after_test = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(size_after_test, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp, temp_keys_bytes,
                            cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            temp_scores_bytes, cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            temp_vectors_bytes, cudaMemcpyDefault));

      for (int row = 0; row < dump_counter; row++) {
        const bool in_base = has_key(h_keys_base, h_keys_temp[row]);
        const bool in_test = has_key(h_keys_test, h_keys_temp[row]);
        if (!(in_base && in_test)) {
          // Inserted once: the score is the plain frequency value.
          ASSERT_EQ(h_scores_temp[row], (h_keys_temp[row] % freq_range));
        } else {
          // Inserted in both phases: the score is expected to have doubled.
          ASSERT_EQ(h_scores_temp[row], (h_keys_temp[row] % freq_range) * 2);
        }
        for (int col = 0; col < options.dim; col++) {
          ASSERT_EQ(h_vectors_temp[row * options.dim + col],
                    static_cast<float>(h_keys_temp[row] * 0.00001));
        }
      }
    }
    CUDA_CHECK(cudaStreamDestroy(stream));
  }

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks score assignment of a kEpochLru table.
//
// The expected score layout used by the assertions below is
// `(global_epoch << 32) | ts`, where `ts` is a host nanosecond timestamp
// shifted right by RSHIFT_ON_NANO and masked to 32 bits.
//
// Each round builds a fresh table and runs two phases:
//   Phase 1: insert the base keys without explicit scores under the current
//            epoch and check that the smallest/largest exported scores lie
//            within the expected values built from timestamps taken right
//            before and right after the insert.
//   Phase 2: advance the epoch, insert the test keys (two of which duplicate
//            base keys 72 and 73) and check that keys from the test set carry
//            a score of at least `(epoch << 32) | start_ts`, while all other
//            exported keys carry a score no greater than that value.
// Exported vectors are compared against `key * 0.00001` in both phases.
//
// `max_hbm_for_vectors` is forwarded to `nv::merlin::GB` to fill
// `options.max_hbm_for_vectors`.
void test_evict_strategy_epochlru_basic(size_t max_hbm_for_vectors) {
  // Right shift applied to the host nanosecond timestamp before masking.
  constexpr int RSHIFT_ON_NANO = 20;

  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  // Scratch buffers are sized for the larger of the two key sets.
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 13;
  options.num_of_buckets_per_alloc = 8;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  // Scratch area that receives the exported keys, scores and vectors.
  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  // NOTE(review): presumably both helpers generate keys that map to a single
  // bucket, drawn from disjoint key ranges -- confirm in test_util.
  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  // Make the last two test keys duplicates of base keys 72 and 73 (key and
  // vector), so phase 2 mixes new keys with already-present ones.
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  for (int i = 0; i < options.dim; i++) {
    h_vectors_test[2 * options.dim + i] = h_vectors_base[72 * options.dim + i];
    h_vectors_test[3 * options.dim + i] = h_vectors_base[73 * options.dim + i];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  // Not reset between rounds: it is incremented once per round (in phase 2),
  // so every round starts at the epoch the previous round ended with.
  S global_epoch = 1;
  for (int i = 0; i < TEST_TIMES; i++) {
    // A fresh table per round; it must start out empty.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys under the current epoch.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      // start_ts / end_ts bracket the insert; the statement order matters.
      S start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
      table->set_global_epoch(global_epoch);
      // Scores are passed as nullptr here.
      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                              nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;

      // Shadows the outer `total_size` on purpose of this scope only.
      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Sort a copy so the smallest and largest scores can be range-checked.
      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));
      ASSERT_LE(h_scores_temp_sorted[BASE_KEY_NUM - 1],
                (global_epoch << 32 | end_ts));
      // Every exported vector element must equal key * 0.00001.
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: advance the epoch and insert the test keys.
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      S start_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;
      table->set_global_epoch(global_epoch);
      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                              nullptr, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      S end_ts =
          (test_util::host_nano<S>(stream) >> RSHIFT_ON_NANO) & 0xFFFFFFFF;

      // The table size is expected to stay at BUCKET_MAX_SIZE.
      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // Split the exported entries: keys from the test set must carry a score
      // from the new epoch; their scores are collected for the range check.
      std::vector<S> h_scores_temp_sorted;
      h_scores_temp_sorted.reserve(TEMP_KEY_NUM);
      for (int i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i])) {
          ASSERT_GE(h_scores_temp[i], (global_epoch << 32 | start_ts));
          h_scores_temp_sorted.push_back(h_scores_temp[i]);
        } else {
          // Keys not in the test set keep their phase-1 score.
          ASSERT_LE(h_scores_temp[i], (global_epoch << 32 | start_ts));
        }
      }
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      // Guarded so nothing is indexed when no test key was exported.
      if (!h_scores_temp_sorted.empty()) {
        ASSERT_GE(h_scores_temp_sorted[0], (global_epoch << 32 | start_ts));
        ASSERT_LE(h_scores_temp_sorted[h_scores_temp_sorted.size() - 1],
                  (global_epoch << 32 | end_ts));
      }
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// Checks the scores produced by a table that uses `EvictStrategy::kEpochLfu`.
//
// Every round creates a fresh table and runs two phases:
//   1. Insert BASE_KEY_NUM base keys under the current global epoch, export
//      the table and compare each score with
//      `test_util::make_expected_score_for_epochlfu(epoch, frequency)`.
//   2. Advance the global epoch, insert TEST_KEY_NUM keys (index 0 is left as
//      generated, indices 1..3 are copies of base keys 71..73), export again
//      and compare the scores:
//        - key in both sets  -> new epoch, helper is given frequency * 2;
//        - key only in base  -> previous epoch, original frequency;
//        - key only in test  -> new epoch, original frequency.
//
// Base key 71 gets a frequency of UINT32_MAX - 1 so that the second insertion
// simulates an overflow of the low 32 bits of the score. For every other key
// the test expects the frequency to be `key % freq_range` (presumably what
// `create_keys_in_one_buckets_lfu` assigns - confirm against the helper).
//
// `max_hbm_for_vectors` is passed through `nv::merlin::GB()` into
// `TableOptions::max_hbm_for_vectors`.
//
// NOTE: the GoogleTest ASSERT_* macros return from this function on failure,
// in which case the stream and the device buffers are not released.
void test_evict_strategy_epochlfu_basic(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 4;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 14;
  options.num_of_buckets_per_alloc = 8;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  // Host staging area for whatever `export_batch` returns.
  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  // Device buffers shared by the uploads and the exports.
  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  int freq_range = 1000;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF,
      freq_range);

  test_util::create_keys_in_one_buckets_lfu<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD, freq_range);

  // Simulate overflow of low 32bits.
  h_scores_base[71] = static_cast<S>(std::numeric_limits<uint32_t>::max() -
                                     static_cast<uint32_t>(1));

  // Test keys 1..3 re-use base keys 71..73 so that they hit existing entries.
  h_keys_test[1] = h_keys_base[71];
  h_keys_test[2] = h_keys_base[72];
  h_keys_test[3] = h_keys_base[73];

  h_scores_test[1] = h_scores_base[71];
  h_scores_test[2] = h_keys_base[72] % freq_range;
  h_scores_test[3] = h_keys_base[73] % freq_range;

  for (size_t d = 0; d < options.dim; d++) {
    h_vectors_test[1 * options.dim + d] = h_vectors_base[71 * options.dim + d];
    h_vectors_test[2 * options.dim + d] = h_vectors_base[72 * options.dim + d];
    h_vectors_test[3 * options.dim + d] = h_vectors_base[73 * options.dim + d];
  }
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  // The epoch is never reset: it keeps growing across rounds.
  S global_epoch = 1;
  for (uint64_t round = 0; round < TEST_TIMES; round++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    // Phase 1: insert the base keys under the current epoch.
    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (size_t i = 0; i < dump_counter; i++) {
        if (h_keys_temp[i] == h_keys_base[71]) {
          // The only key whose frequency was overridden by hand.
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, h_scores_base[71]);
          ASSERT_EQ(h_scores_temp[i], expected_score);
        } else {
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch, (h_keys_temp[i] % freq_range));
          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    // Phase 2: advance the epoch and insert the test keys.
    {
      global_epoch++;
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->set_global_epoch(global_epoch);
      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      // The key with the near-overflow frequency must still be in the table.
      ASSERT_TRUE(h_keys_temp.end() != std::find(h_keys_temp.begin(),
                                                 h_keys_temp.end(),
                                                 h_keys_base[71]));

      for (size_t i = 0; i < dump_counter; i++) {
        bool in_base =
            h_keys_base.end() !=
            std::find(h_keys_base.begin(), h_keys_base.end(), h_keys_temp[i]);
        bool in_test =
            h_keys_test.end() !=
            std::find(h_keys_test.begin(), h_keys_test.end(), h_keys_temp[i]);

        if (in_base && in_test) {
          // Inserted in both phases: new epoch, doubled frequency.
          if (h_keys_temp[i] == h_keys_base[71]) {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch, h_scores_base[71] * 2);
            ASSERT_EQ(h_scores_temp[i], expected_score);
          } else {
            S expected_score = test_util::make_expected_score_for_epochlfu<S>(
                global_epoch, (h_keys_temp[i] % freq_range) * 2);
            ASSERT_EQ(h_scores_temp[i], expected_score);
          }
        } else {
          // Inserted in exactly one phase. Base key 71 is also
          // `h_keys_test[1]`, so it can never reach this branch and needs no
          // special case here. A base-only key keeps the previous epoch
          // (`in_base` == 1), a test-only key carries the new one.
          S expected_score = test_util::make_expected_score_for_epochlfu<S>(
              global_epoch - static_cast<S>(in_base),
              (h_keys_temp[i] % freq_range));

          ASSERT_EQ(h_scores_temp[i], expected_score);
        }
        for (size_t j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_basic(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 128;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 128;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 15;
  options.num_of_buckets_per_alloc = 8;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  const S base_score_start = 1000;
  for (int i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);
  const S test_score_start = base_score_start + BASE_KEY_NUM;
  for (int i = 0; i < TEST_KEY_NUM; i++) {
    h_scores_test[i] = test_score_start + i;
  }
  for (int i = 64; i < TEST_KEY_NUM; i++) {
    h_keys_test[i] = h_keys_base[i];
    for (int j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] = h_vectors_base[i * options.dim + j];
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range_test =
          test_util::range<S, TEST_KEY_NUM>(test_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range_test.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_advanced(size_t max_hbm_for_vectors) {
  constexpr uint64_t BUCKET_NUM = 8UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  constexpr uint64_t INIT_CAPACITY = BUCKET_NUM * BUCKET_MAX_SIZE;  // 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t BASE_KEY_NUM = BUCKET_MAX_SIZE;
  constexpr uint64_t TEST_KEY_NUM = 8;
  constexpr uint64_t TEMP_KEY_NUM =
      (BASE_KEY_NUM > TEST_KEY_NUM) ? BASE_KEY_NUM : TEST_KEY_NUM;
  constexpr uint64_t TEST_TIMES = 256;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 16;
  options.num_of_buckets_per_alloc = 8;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  std::vector<K> h_keys_base(BASE_KEY_NUM);
  std::vector<S> h_scores_base(BASE_KEY_NUM);
  std::vector<V> h_vectors_base(BASE_KEY_NUM * DIM);

  std::vector<K> h_keys_test(TEST_KEY_NUM);
  std::vector<S> h_scores_test(TEST_KEY_NUM);
  std::vector<V> h_vectors_test(TEST_KEY_NUM * DIM);

  std::vector<K> h_keys_temp(TEMP_KEY_NUM);
  std::vector<S> h_scores_temp(TEMP_KEY_NUM);
  std::vector<V> h_vectors_temp(TEMP_KEY_NUM * DIM);

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, TEMP_KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, TEMP_KEY_NUM * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, TEMP_KEY_NUM * sizeof(V) * options.dim));

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_base.data(), h_scores_base.data(), h_vectors_base.data(),
      BASE_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0, 0x3FFFFFFFFFFFFFFF);

  const S base_score_start = 1000;
  for (int i = 0; i < BASE_KEY_NUM; i++) {
    h_scores_base[i] = base_score_start + i;
  }

  test_util::create_keys_in_one_buckets<K, S, V, DIM>(
      h_keys_test.data(), h_scores_test.data(), h_vectors_test.data(),
      TEST_KEY_NUM, INIT_CAPACITY, BUCKET_MAX_SIZE, 1, 0x3FFFFFFFFFFFFFFF,
      0xFFFFFFFFFFFFFFFD);

  h_keys_test[4] = h_keys_base[72];
  h_keys_test[5] = h_keys_base[73];
  h_keys_test[6] = h_keys_base[74];
  h_keys_test[7] = h_keys_base[75];

  // replace four new keys to lower scores, would not be inserted.
  h_scores_test[0] = 20;
  h_scores_test[1] = 78;
  h_scores_test[2] = 97;
  h_scores_test[3] = 98;

  // replace three exist keys to new scores, just refresh the score for them.
  h_scores_test[4] = 99;
  h_scores_test[5] = 1010;
  h_scores_test[6] = 1020;
  h_scores_test[7] = 1035;

  for (int i = 4; i < TEST_KEY_NUM; i++) {
    for (int j = 0; j < options.dim; j++) {
      h_vectors_test[i * options.dim + j] =
          static_cast<V>(h_keys_test[i] * 0.00001);
    }
  }

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t dump_counter = 0;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    ASSERT_EQ(table->bucket_count(), BUCKET_NUM);

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base.data(),
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base.data(),
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base.data(),
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->insert_or_assign(BASE_KEY_NUM, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            BASE_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            BASE_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            BASE_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      std::vector<S> h_scores_temp_sorted(h_scores_temp);
      std::sort(h_scores_temp_sorted.begin(), h_scores_temp_sorted.end());

      auto expected_range = test_util::range<S, TEMP_KEY_NUM>(base_score_start);
      ASSERT_TRUE(std::equal(h_scores_temp_sorted.begin(),
                             h_scores_temp_sorted.end(),
                             expected_range.begin()));
      for (int i = 0; i < dump_counter; i++) {
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }

    {
      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_test.data(),
                            TEST_KEY_NUM * sizeof(K), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_test.data(),
                            TEST_KEY_NUM * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_test.data(),
                            TEST_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->insert_or_assign(TEST_KEY_NUM, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, BUCKET_MAX_SIZE);

      dump_counter = table->export_batch(table->capacity(), 0, d_keys_temp,
                                         d_vectors_temp, d_scores_temp, stream);
      ASSERT_EQ(dump_counter, BUCKET_MAX_SIZE);

      CUDA_CHECK(cudaMemcpy(h_keys_temp.data(), d_keys_temp,
                            TEMP_KEY_NUM * sizeof(K), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp.data(), d_scores_temp,
                            TEMP_KEY_NUM * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp.data(), d_vectors_temp,
                            TEMP_KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      for (int i = 0; i < TEST_KEY_NUM; i++) {
        if (i < 4) {
          ASSERT_EQ(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        } else {
          ASSERT_NE(h_keys_temp.end(),
                    std::find(h_keys_temp.begin(), h_keys_temp.end(),
                              h_keys_test[i]));
        }
      }
      for (int i = 0; i < TEMP_KEY_NUM; i++) {
        if (h_keys_temp[i] == h_keys_test[4])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[4]);
        if (h_keys_temp[i] == h_keys_test[5])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[5]);
        if (h_keys_temp[i] == h_keys_test[6])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[6]);
        if (h_keys_temp[i] == h_keys_test[7])
          ASSERT_EQ(h_scores_temp[i], h_scores_test[7]);

        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

void test_evict_strategy_customized_correct_rate(size_t max_hbm_for_vectors) {
  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
  constexpr uint64_t STEPS = 128;
  constexpr uint64_t MAX_BUCKET_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t TEST_TIMES = 1;
  float expected_correct_rate = 0.964;
  const int rounds = 12;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 17;
  options.num_of_buckets_per_alloc = 128;
  options.max_bucket_size = MAX_BUCKET_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  K* h_keys_base = test_util::HostBuffer<K>(BATCH_SIZE).ptr();
  S* h_scores_base = test_util::HostBuffer<S>(BATCH_SIZE).ptr();
  V* h_vectors_base = test_util::HostBuffer<V>(BATCH_SIZE * options.dim).ptr();

  K* h_keys_temp = test_util::HostBuffer<K>(MAX_CAPACITY).ptr();
  S* h_scores_temp = test_util::HostBuffer<S>(MAX_CAPACITY).ptr();
  V* h_vectors_temp =
      test_util::HostBuffer<V>(MAX_CAPACITY * options.dim).ptr();

  K* d_keys_temp;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  size_t total_size = 0;
  size_t global_start_key = 100000;
  for (int i = 0; i < TEST_TIMES; i++) {
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(options);
    size_t start_key = global_start_key;

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_EQ(total_size, 0);

    for (int r = 0; r < rounds; r++) {
      size_t expected_min_key = global_start_key + INIT_CAPACITY * r;
      size_t expected_max_key = global_start_key + INIT_CAPACITY * (r + 1) - 1;
      size_t expected_table_size =
          (r == 0) ? size_t(expected_correct_rate * INIT_CAPACITY)
                   : INIT_CAPACITY;

      for (int s = 0; s < STEPS; s++) {
        test_util::create_continuous_keys<K, S, V, DIM>(
            h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);
        start_key += BATCH_SIZE;

        CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),
                              cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,
                              BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,
                              BATCH_SIZE * sizeof(V) * options.dim,
                              cudaMemcpyHostToDevice));
        table->insert_or_assign(BATCH_SIZE, d_keys_temp, d_vectors_temp,
                                d_scores_temp, stream);
        CUDA_CHECK(cudaStreamSynchronize(stream));
      }

      size_t total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_GE(total_size, expected_table_size);
      ASSERT_EQ(MAX_CAPACITY, table->capacity());

      size_t dump_counter = table->export_batch(
          MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);

      CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),
                            cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,
                            MAX_CAPACITY * sizeof(S), cudaMemcpyDefault));
      CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,
                            MAX_CAPACITY * sizeof(V) * options.dim,
                            cudaMemcpyDefault));

      ASSERT_EQ(total_size, dump_counter);
      size_t bigger_score_counter = 0;
      K max_key = 0;

      for (int i = 0; i < dump_counter; i++) {
        ASSERT_EQ(h_keys_temp[i], h_scores_temp[i]);
        max_key = std::max(max_key, h_keys_temp[i]);
        if (h_scores_temp[i] >= expected_min_key) bigger_score_counter++;
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors_temp[i * options.dim + j],
                    static_cast<float>(h_keys_temp[i] * 0.00001));
        }
      }

      float correct_rate = (bigger_score_counter * 1.0) / MAX_CAPACITY;
      std::cout << std::setprecision(3) << "[Round " << r << "]"
                << "correct_rate=" << correct_rate << std::endl;
      ASSERT_GE(max_key, expected_max_key);
      ASSERT_GE(correct_rate, expected_correct_rate);
    }
  }
  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

/**
 * Exercises insert_or_assign / find / contains on one shared LRU table from
 * many host threads.
 *
 * THREAD_N (64) tasks are split into three batches:
 *   - batch 0: worker threads run concurrently and use hard assertions;
 *   - batch 1: each worker thread is joined before the next one starts;
 *   - batch 2: worker threads run concurrently; missing keys or unexpected
 *     values are only counted and printed, not asserted.
 * Tasks alternate between `worker1` (single insert) and `worker2` (insert
 * followed by an overwrite of the same keys).
 *
 * @param max_hbm_for_vectors Passed through nv::merlin::GB() into
 *                            TableOptions::max_hbm_for_vectors.
 * @param BATCH_0_RATIO       Fraction of THREAD_N assigned to batch 0.
 * @param BATCH_1_RATIO       Fraction of THREAD_N assigned to batch 1; the
 *                            remainder forms batch 2.
 * @param capacity_silent     When false, a worker prints a line if the table
 *                            capacity changed while it was running.
 */
void test_insert_or_assign_multi_threads(size_t max_hbm_for_vectors,
                                         const float BATCH_0_RATIO,
                                         const float BATCH_1_RATIO,
                                         bool capacity_silent = true) {
  const uint64_t THREAD_N = 64UL;
  const uint64_t BATCH_0_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_0_RATIO);
  const uint64_t BATCH_1_SIZE = static_cast<uint64_t>(THREAD_N * BATCH_1_RATIO);
  const uint64_t BATCH_2_SIZE = THREAD_N - BATCH_0_SIZE - BATCH_1_SIZE;

  const uint64_t INIT_CAPACITY = 32 * 1024 * 1024UL;
  const uint64_t MAX_CAPACITY = 128 * 1024 * 1024UL;
  constexpr uint64_t BUCKET_MAX_SIZE = 128UL;
  // Number of keys each worker inserts.
  constexpr uint64_t KEY_NUM = 1 * 1024 * 1024UL;

  std::vector<std::thread> threads;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_load_factor = 0.50f;
  options.max_bucket_size = BUCKET_MAX_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;

  std::shared_ptr<Table> table = std::make_shared<Table>();
  table->init(options);
  // worker1: insert a fresh set of random keys once and read them back.
  // The checks assume that keys generated by different workers never collide.
  auto worker1 = [&table, KEY_NUM, options, capacity_silent](int batch,
                                                             int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    // Remembered so a capacity change during this task can be reported.
    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    // Each worker issues its table operations on its own stream.
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    // Step 1: none of the freshly generated keys may be present yet.
    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    {
      int found_num = 0;
      CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      for (int i = 0; i < KEY_NUM; i++) {
        if (h_found[i]) {
          found_num++;
        }
      }
      ASSERT_EQ(found_num, 0);
    }

    // Step 2: insert, then clear the device vectors so that find() has to
    // supply the values that are checked below.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // print_unequal limits the mismatch report to the first occurrence;
    // err_times counts every mismatching element (batch 2 only).
    thread_local bool print_unequal{false};
    thread_local uint64_t err_times{0};
    // Step 3: every found element must equal key * 0.00001.
    // Batch 2 tolerates mismatches (counted and reported); batches 0 and 1
    // assert on them.
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          if (batch == 2) {
            if (h_vectors[i * options.dim + j] !=
                static_cast<float>(h_keys[i] * 0.00001)) {
              if (!print_unequal) {
                std::cout << " [Thread " << task_n << "]\t";
                UNEQUAL_EXPR(h_vectors[i * options.dim + j],
                             static_cast<float>(h_keys[i] * 0.00001));
                print_unequal = true;
              }
              err_times += 1;
            }
          } else {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      static_cast<float>(h_keys[i] * 0.00001));
          }
        }
      }
    }

    // Step 4: contains() must report the same number of keys as find().
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    // Step 5: batches 0 and 1 require every key to be found with no value
    // errors; batch 2 only prints a summary when that is not the case.
    bool print_thread_id{false};
    if (batch == 0 || batch == 1) {
      ASSERT_EQ(found_num, KEY_NUM);
      ASSERT_EQ(err_times, 0);
    } else {
      if (found_num != KEY_NUM or err_times != 0) {
        std::cout << " [Thread " << task_n << "]\t"
                  << "Number of keys(insert/found/error) : " << "(" << KEY_NUM
                  << "/" << found_num << "/" << err_times << ") \t";
        print_thread_id = true;
      }
    }
    // Optionally report a capacity change observed during this task;
    // otherwise just terminate the summary line if one was started.
    if (current_capacity != table->capacity() && !capacity_silent) {
      if (!print_thread_id) std::cout << " [Thread " << task_n << "]\t";

      std::cout << "The capacity changed from " << current_capacity << " to "
                << table->capacity() << std::endl;
    } else if (print_thread_id) {
      std::cout << std::endl;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));

    // NOTE(review): the result of this copy is not inspected before the
    // buffers are released below.
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };
  // worker2: insert a fresh set of random keys, immediately overwrite them
  // with a second value buffer, and verify that find() returns the second
  // values.
  auto worker2 = [&table, KEY_NUM, options, capacity_silent](int batch,
                                                             int task_n) {
    K* h_keys;
    V* h_vectors;
    bool* h_found;

    // Remembered so a capacity change during this task can be reported.
    size_t current_capacity = table->capacity();

    CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

    K* d_keys;
    V* d_vectors;
    V* d_new_vectors;
    bool* d_found;

    CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
    CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_new_vectors, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

    // Each worker issues its table operations on its own stream.
    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    test_util::create_random_keys<K, S, V, DIM>(h_keys, nullptr, h_vectors,
                                                KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    // Every byte of the replacement values is 0x02; the expected element
    // bit pattern is rebuilt from `i_value` below.
    CUDA_CHECK(cudaMemset(d_new_vectors, 2, KEY_NUM * sizeof(V) * options.dim));

    // Both calls are enqueued on the same stream: first the generated
    // values, then the replacement values for the same keys.
    table->insert_or_assign(KEY_NUM, d_keys, d_vectors, nullptr, stream);
    table->insert_or_assign(KEY_NUM, d_keys, d_new_vectors, nullptr, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));

    // Clear the device vectors so that find() has to supply the values that
    // are checked below.
    CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);

    CUDA_CHECK(cudaStreamSynchronize(stream));
    int found_num = 0;

    CUDA_CHECK(cudaMemset(h_found, 0, KEY_NUM * sizeof(bool)));
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
    CUDA_CHECK(cudaMemcpy(h_keys, d_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));
    // print_unequal limits the mismatch report to the first occurrence;
    // err_times counts every mismatching element (batch 2 only).
    thread_local bool print_unequal{false};
    thread_local uint64_t err_times{0};
    // Four 0x02 bytes: the pattern written into d_new_vectors, reinterpreted
    // as a float for the comparisons.
    uint32_t i_value = 0x2020202;
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) {
        found_num++;
        for (int j = 0; j < options.dim; j++) {
          if (batch == 2) {
            if (h_vectors[i * options.dim + j] !=
                *(reinterpret_cast<float*>(&i_value))) {
              if (!print_unequal) {
                std::cout << " [Thread " << task_n << "]\t";
                UNEQUAL_EXPR(h_vectors[i * options.dim + j],
                             *(reinterpret_cast<float*>(&i_value)));
                print_unequal = true;
              }
              err_times += 1;
            }
          } else {
            ASSERT_EQ(h_vectors[i * options.dim + j],
                      *(reinterpret_cast<float*>(&i_value)));
          }
        }
      }
    }

    // contains() must report the same number of keys as find().
    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
    table->contains(KEY_NUM, d_keys, d_found, stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    int contains_num = 0;
    CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < KEY_NUM; i++) {
      if (h_found[i]) contains_num++;
    }
    ASSERT_EQ(contains_num, found_num);

    // Batches 0 and 1 require every key to be found with no value errors;
    // batch 2 only prints a summary when that is not the case.
    bool print_thread_id{false};
    if (batch == 0 || batch == 1) {
      ASSERT_EQ(found_num, KEY_NUM);
      ASSERT_EQ(err_times, 0);
    } else {
      if (found_num != KEY_NUM or err_times != 0) {
        std::cout << " [Thread " << task_n << "]\t"
                  << "Number of keys(insert/found/error) : " << "(" << KEY_NUM
                  << "/" << found_num << "/" << err_times << ") \t";
        print_thread_id = true;
      }
    }
    // Optionally report a capacity change observed during this task;
    // otherwise just terminate the summary line if one was started.
    if (current_capacity != table->capacity() && !capacity_silent) {
      if (!print_thread_id) std::cout << " [Thread " << task_n << "]\t";

      std::cout << "The capacity changed from " << current_capacity << " to "
                << table->capacity() << std::endl;
    } else if (print_thread_id) {
      std::cout << std::endl;
    }

    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaStreamDestroy(stream));

    // NOTE(review): the result of this copy is not inspected before the
    // buffers are released below.
    CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFreeHost(h_keys));
    CUDA_CHECK(cudaFreeHost(h_found));
    CUDA_CHECK(cudaFreeHost(h_vectors));

    CUDA_CHECK(cudaFree(d_keys));
    CUDA_CHECK(cudaFree(d_vectors));
    CUDA_CHECK(cudaFree(d_new_vectors));
    CUDA_CHECK(cudaFree(d_found));
    CUDA_CHECK(cudaDeviceSynchronize());

    CudaCheckError();
  };

  /* Batch 0: the table is relatively idle; the workers assert that every
   * key is found, i.e. they assume no eviction happens. Threads are launched
   * in pairs (worker1, worker2) and all run concurrently. */
  int batch = 0;
  std::cout << "[Batch 0] " << BATCH_0_SIZE << " threads\n";
  for (int i = 0; i < BATCH_0_SIZE; i += 2) {
    threads.emplace_back(std::thread(worker1, batch, i));
    threads.emplace_back(std::thread(worker2, batch, i + 1));
  }
  for (auto& th : threads) {
    th.join();
  }
  threads.clear();

  /* Batch 1: check the correctness of the APIs serially; each thread is
   * joined before the next one is started. */
  batch = 1;
  std::cout << "[Batch 1] " << BATCH_1_SIZE << " threads\n";
  for (int i = BATCH_0_SIZE; i < BATCH_0_SIZE + BATCH_1_SIZE; i += 2) {
    auto th = std::thread(worker1, batch, i);
    th.join();
    th = std::thread(worker2, batch, i + 1);
    th.join();
  }

  /* Batch 2: eviction may occur, so the workers only report missing keys or
   * unexpected values instead of asserting on them. */
  batch = 2;
  std::cout << "[Batch 2] " << BATCH_2_SIZE << " threads\n";
  for (int i = BATCH_0_SIZE + BATCH_1_SIZE; i < THREAD_N; i += 2) {
    threads.emplace_back(std::thread(worker1, batch, i));
    threads.emplace_back(std::thread(worker2, batch, i + 1));
  }
  for (auto& th : threads) {
    th.join();
  }
  // After all batches the table is expected to have grown to MAX_CAPACITY.
  ASSERT_EQ(table->capacity(), MAX_CAPACITY);
}

/**
 * Inserts one batch with insert_or_assign and validates the whole table
 * content afterwards.
 *
 * Steps:
 *   1. export the table and assert that the exported count equals size();
 *   2. insert_or_assign(len, keys, values) with no explicit scores;
 *   3. export again, assert the count equals the new size(), and check that
 *      every element of every exported vector equals key * 0.00001.
 * A summary line is always printed, including when the value check fails.
 *
 * @tparam dim    Number of V elements per value vector.
 * @param table   Table under test.
 * @param keys    Keys handed to insert_or_assign (`len` entries).
 * @param values  Values handed to insert_or_assign (`len * dim` entries).
 * @param scores  Unused; kept for interface compatibility.
 * @param len     Number of key/value pairs to insert.
 * @param stream  Stream used for all table and copy operations.
 */
template <typename K, typename V, typename S, typename Table, size_t dim = 64>
void CheckInsertOrAssignValues(Table* table, K* keys, V* values, S* scores,
                               size_t len, cudaStream_t stream) {
  using Row = test_util::ValueArray<V, dim>;

  K* d_tmp_keys = nullptr;
  V* d_tmp_values = nullptr;
  S* d_tmp_scores = nullptr;

  const size_t table_size_before = table->size(stream);
  // Buffers are sized for the existing entries plus the whole incoming batch.
  const size_t cap = table_size_before + len;

  CUDA_CHECK(cudaMallocAsync(&d_tmp_keys, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_keys, 0, cap * sizeof(K), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_values, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_values, 0, cap * dim * sizeof(V), stream));
  CUDA_CHECK(cudaMallocAsync(&d_tmp_scores, cap * sizeof(S), stream));
  CUDA_CHECK(cudaMemsetAsync(d_tmp_scores, 0, cap * sizeof(S), stream));

  // Owning host buffers: released automatically on every exit path,
  // including the early returns taken by a failing ASSERT_*.
  std::unique_ptr<K[]> h_tmp_keys(new K[cap]);
  std::unique_ptr<V[]> h_tmp_values(new V[cap * dim]);

  // Step 1: the export must agree with size() before anything is inserted.
  const size_t table_size_verify0 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(table_size_before, table_size_verify0);

  // Step 2: insert the batch.
  table->insert_or_assign(len, keys, values, nullptr, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Step 3: export again and pull keys and values back to the host.
  const size_t table_size_after = table->size(stream);
  const size_t table_size_verify1 = table->export_batch(
      table->capacity(), 0, d_tmp_keys, d_tmp_values, d_tmp_scores, stream);
  ASSERT_EQ(table_size_verify1, table_size_after);

  CUDA_CHECK(cudaMemcpyAsync(h_tmp_keys.get(), d_tmp_keys,
                             table_size_after * sizeof(K),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaMemcpyAsync(h_tmp_values.get(), d_tmp_values,
                             table_size_after * dim * sizeof(V),
                             cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Filled back-to-front so that, should a key be exported twice, the entry
  // with the lowest index is the one that remains in the map.
  std::map<K, Row> map_after_insert;
  for (int64_t i = static_cast<int64_t>(table_size_after) - 1; i >= 0; i--) {
    const Row* vec = reinterpret_cast<const Row*>(h_tmp_values.get() + i * dim);
    map_after_insert[h_tmp_keys[i]] = *vec;
  }

  // Count the keys whose vector has at least one unexpected element.
  size_t value_diff_cnt = 0;
  for (auto& it : map_after_insert) {
    Row& vec = it.second;
    for (size_t j = 0; j < dim; j++) {
      if (vec[j] != static_cast<float>(it.first * 0.00001)) {
        ++value_diff_cnt;
        break;
      }
    }
  }

  // Report first, so the numbers are visible when the assertion fails.
  std::cout << "Check insert behavior got value_diff_cnt: " << value_diff_cnt
            << ", while table_size_before: " << table_size_before
            << ", while table_size_after: " << table_size_after
            << ", while len: " << len << std::endl;

  // Release the device buffers before the final assertion can return early.
  CUDA_CHECK(cudaFreeAsync(d_tmp_keys, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_values, stream));
  CUDA_CHECK(cudaFreeAsync(d_tmp_scores, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));

  ASSERT_EQ(value_diff_cnt, 0);
}

/**
 * Repeatedly inserts random batches into a small LRU table and validates the
 * complete table content after each batch via CheckInsertOrAssignValues.
 *
 * Each batch holds B = 524288 + 13 keys, slightly more than the table's
 * max_capacity of 524288.
 *
 * @param max_hbm_for_vectors Passed through nv::merlin::GB() into
 *                            HashTableOptions::max_hbm_for_vectors.
 */
void test_insert_or_assign_values_check(size_t max_hbm_for_vectors) {
  const size_t U = 524288;
  const size_t init_capacity = 1024;
  const size_t B = 524288 + 13;
  constexpr size_t dim = 64;
  constexpr int kRounds = 20;

  TableOptions opt;

  opt.max_capacity = U;
  opt.init_capacity = init_capacity;
  opt.max_hbm_for_vectors = nv::merlin::GB(max_hbm_for_vectors);
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>;
  opt.dim = dim;

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  {
    // Inner scope: the table and the data buffer are destroyed before the
    // stream they were used with is torn down.
    std::unique_ptr<Table> table = std::make_unique<Table>();
    table->init(opt);

    test_util::KVMSBuffer<K, V, S> data_buffer;
    data_buffer.Reserve(B, dim, stream);

    for (int i = 0; i < kRounds; i++) {
      test_util::create_random_keys<K, S, V, dim>(
          data_buffer.keys_ptr(false), data_buffer.scores_ptr(false),
          data_buffer.values_ptr(false), (int)B, B * 16);
      data_buffer.SyncData(true, stream);

      CheckInsertOrAssignValues<K, V, S, Table, dim>(
          table.get(), data_buffer.keys_ptr(), data_buffer.values_ptr(),
          data_buffer.scores_ptr(), B, stream);

      // ASSERT_* inside the helper only returns from the helper; stop the
      // remaining rounds once a fatal failure has been recorded.
      if (::testing::Test::HasFatalFailure()) break;
    }
  }

  CUDA_CHECK(cudaStreamSynchronize(stream));
  CUDA_CHECK(cudaStreamDestroy(stream));
}

/**
 * Checks insert_or_assign / find / contains for a range of bucket sizes.
 *
 * For every max_bucket_size in {8, 16, ..., 2048} a fresh table with the
 * kCustomized eviction strategy is created, KEY_NUM keys are inserted with
 * explicit scores, and the test asserts that
 *   - size() equals KEY_NUM after the insert,
 *   - find() returns every key with vector elements equal to key * 0.00001,
 *   - contains() reports the same number of keys as find().
 *
 * @param load_scores When true, find() is also asked for the scores and each
 *                    returned score is asserted to equal its key (presumably
 *                    create_random_keys generates scores equal to the keys -
 *                    TODO confirm). When false, find() receives a null score
 *                    pointer and scores are not checked.
 */
void test_bucket_size(bool load_scores = true) {
  constexpr uint64_t INIT_CAPACITY = 128 * 1024UL;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr uint64_t KEY_NUM = 128UL;
  constexpr uint64_t TEST_TIMES = 1;
  constexpr uint32_t DIM = 4;

  K* h_keys;
  S* h_scores;
  V* h_vectors;
  bool* h_found;

  TableOptions options;

  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.max_hbm_for_vectors = nv::merlin::GB(16);
  options.reserved_key_start_bit = 1;
  options.num_of_buckets_per_alloc = 2;
  using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

  CUDA_CHECK(cudaMallocHost(&h_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMallocHost(&h_found, KEY_NUM * sizeof(bool)));

  K* d_keys;
  S* d_scores = nullptr;
  V* d_vectors;
  bool* d_found;

  CUDA_CHECK(cudaMalloc(&d_keys, KEY_NUM * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores, KEY_NUM * sizeof(S)));
  CUDA_CHECK(cudaMalloc(&d_vectors, KEY_NUM * sizeof(V) * options.dim));
  CUDA_CHECK(cudaMalloc(&d_found, KEY_NUM * sizeof(bool)));

  // Bucket sizes are swept in powers of two over [lowerBound, upperBound].
  uint64_t lowerBound = 8;
  uint64_t upperBound = 2048;
  for (uint64_t bucket_max_size = lowerBound; bucket_max_size <= upperBound;
       bucket_max_size *= 2) {
    options.max_bucket_size = bucket_max_size;
    CUDA_CHECK(cudaMemset(h_vectors, 0, KEY_NUM * sizeof(V) * options.dim));

    // Fresh random input for every bucket size.
    test_util::create_random_keys<K, S, V>(options.dim, h_keys, h_scores,
                                           h_vectors, KEY_NUM);
    CUDA_CHECK(cudaMemcpy(d_keys, h_keys, KEY_NUM * sizeof(K),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_scores, h_scores, KEY_NUM * sizeof(S),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_vectors, h_vectors,
                          KEY_NUM * sizeof(V) * options.dim,
                          cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    uint64_t total_size = 0;
    for (int i = 0; i < TEST_TIMES; i++) {
      std::unique_ptr<Table> table = std::make_unique<Table>();
      table->init(options);

      // A freshly initialised table must be empty.
      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, 0);

      table->insert_or_assign(KEY_NUM, d_keys, d_vectors, d_scores, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));

      total_size = table->size(stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      ASSERT_EQ(total_size, KEY_NUM);

      // Clear the device outputs so that find() has to supply the values
      // and scores that are checked below.
      CUDA_CHECK(cudaMemset(d_vectors, 0, KEY_NUM * sizeof(V) * options.dim));
      CUDA_CHECK(cudaMemset(d_scores, 0, KEY_NUM * sizeof(S)));
      if (load_scores) {
        table->find(KEY_NUM, d_keys, d_vectors, d_found, d_scores, stream);
      } else {
        table->find(KEY_NUM, d_keys, d_vectors, d_found, nullptr, stream);
      }
      CUDA_CHECK(cudaStreamSynchronize(stream));
      int found_num = 0;
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      // Read the scores back from the device. The copy kind must match the
      // actual direction (device source, host destination).
      CUDA_CHECK(cudaMemcpy(h_scores, d_scores, KEY_NUM * sizeof(S),
                            cudaMemcpyDeviceToHost));
      CUDA_CHECK(cudaMemcpy(h_vectors, d_vectors,
                            KEY_NUM * sizeof(V) * options.dim,
                            cudaMemcpyDeviceToHost));

      for (int k = 0; k < KEY_NUM; k++) {
        if (h_found[k]) found_num++;
        if (load_scores) ASSERT_EQ(h_scores[k], h_keys[k]);
        for (int j = 0; j < options.dim; j++) {
          ASSERT_EQ(h_vectors[k * options.dim + j],
                    static_cast<float>(h_keys[k] * 0.00001));
        }
      }
      ASSERT_EQ(found_num, KEY_NUM);

      // contains() must agree with find().
      CUDA_CHECK(cudaMemset(d_found, 0, KEY_NUM * sizeof(bool)));
      table->contains(KEY_NUM, d_keys, d_found, stream);
      CUDA_CHECK(cudaStreamSynchronize(stream));
      int contains_num = 0;
      CUDA_CHECK(cudaMemcpy(h_found, d_found, KEY_NUM * sizeof(bool),
                            cudaMemcpyDeviceToHost));
      for (int k = 0; k < KEY_NUM; k++) {
        if (h_found[k]) contains_num++;
      }
      ASSERT_EQ(contains_num, found_num);
    }
    CUDA_CHECK(cudaStreamDestroy(stream));
  }

  CUDA_CHECK(cudaFreeHost(h_keys));
  CUDA_CHECK(cudaFreeHost(h_scores));
  CUDA_CHECK(cudaFreeHost(h_vectors));
  CUDA_CHECK(cudaFreeHost(h_found));

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_scores));
  CUDA_CHECK(cudaFree(d_vectors));
  CUDA_CHECK(cudaFree(d_found));
  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}

// GoogleTest registrations for the hash table test helpers.
// Most helpers are run twice, with 16 and with 0 as the argument. For the
// helpers whose bodies are visible nearby, that argument is forwarded to
// max_hbm_for_vectors (in GB); presumably the same holds for the others, so
// that 0 exercises the host-memory ("hybrid") path - TODO confirm.
TEST(MerlinHashTableTest, test_export_batch_if) {
  test_export_batch_if(16);
  test_export_batch_if(0);
}
// Arguments: max_hbm_for_vectors, BATCH_0_RATIO, BATCH_1_RATIO.
TEST(MerlinHashTableTest, test_insert_or_assign_multi_threads) {
  test_insert_or_assign_multi_threads(16, 0.25f, 0.125f);
  test_insert_or_assign_multi_threads(16, 0.375f, 0.125f);
  test_insert_or_assign_multi_threads(0, 0.25f, 0.125f);
  test_insert_or_assign_multi_threads(0, 0.375f, 0.125f);
}
TEST(MerlinHashTableTest, test_basic) {
  test_basic(16);
  test_basic(0);
}
TEST(MerlinHashTableTest, test_basic_without_rehash) {
  test_basic_without_rehash(16);
  test_basic_without_rehash(0);
}
// Runs with the default load_scores = true only.
TEST(MerlinHashTableTest, test_bucket_size) { test_bucket_size(); }
// Covers several value element types; see test_find_using_pipeline for the
// meaning of the two arguments.
TEST(MerlinHashTableTest, test_find_using_pipeline) {
  test_find_using_pipeline<int32_t>(224, true);
  test_find_using_pipeline<uint32_t>(202, true);
  test_find_using_pipeline<float>(129, true);

  test_find_using_pipeline<float>(128, true);
  test_find_using_pipeline<int32_t>(66, false);
  test_find_using_pipeline<uint32_t>(3, false);
  test_find_using_pipeline<double>(3, true);

  test_find_using_pipeline<int16_t>(128, true);
  test_find_using_pipeline<int8_t>(66, false);
  test_find_using_pipeline<uint16_t>(3, false);
  test_find_using_pipeline<uint8_t>(3, true);
}
TEST(MerlinHashTableTest, test_basic_when_full) {
  test_basic_when_full(16);
  test_basic_when_full(0);
}
// Only EraseIfVersion::V1 is additionally run with argument 0.
TEST(MerlinHashTableTest, test_erase_if_pred) {
  test_erase_if_pred<EraseIfVersion::V1>(16);
  test_erase_if_pred<EraseIfVersion::V1>(0);
  test_erase_if_pred<EraseIfVersion::V2>(16);
  test_erase_if_pred<EraseIfVersion::V3>(16);
}
TEST(MerlinHashTableTest, test_rehash) {
  test_rehash(16);
  test_rehash(0);
}
TEST(MerlinHashTableTest, test_rehash_on_big_batch_specific) {
  test_rehash_on_big_batch_specific(16);
  test_rehash_on_big_batch_specific(0);
}
TEST(MerlinHashTableTest, test_rehash_on_big_batch) {
  test_rehash_on_big_batch(16);
  test_rehash_on_big_batch(0);
}
TEST(MerlinHashTableTest, test_dynamic_rehash_on_multi_threads) {
  test_dynamic_rehash_on_multi_threads(16);
  test_dynamic_rehash_on_multi_threads(0);
}
TEST(MerlinHashTableTest, test_basic_for_cpu_io) { test_basic_for_cpu_io(); }

TEST(MerlinHashTableTest, test_evict_strategy_lru_basic) {
  test_evict_strategy_lru_basic(16);
  test_evict_strategy_lru_basic(0);
}

// The run with argument 0 is currently disabled, see the TODO below.
TEST(MerlinHashTableTest, test_evict_strategy_lfu_basic) {
  test_evict_strategy_lfu_basic(16);
  // TODO(rhdong): Add back when diff error issue fixed in hybrid mode.
  // test_evict_strategy_lfu_basic(0);
}

TEST(MerlinHashTableTest, test_evict_strategy_epochlru_basic) {
  test_evict_strategy_epochlru_basic(16);
  test_evict_strategy_epochlru_basic(0);
}

TEST(MerlinHashTableTest, test_evict_strategy_epochlfu_basic) {
  test_evict_strategy_epochlfu_basic(16);
  test_evict_strategy_epochlfu_basic(0);
}

TEST(MerlinHashTableTest, test_evict_strategy_customized_basic) {
  test_evict_strategy_customized_basic(16);
  test_evict_strategy_customized_basic(0);
}

TEST(MerlinHashTableTest, test_evict_strategy_customized_advanced) {
  test_evict_strategy_customized_advanced(16);
  test_evict_strategy_customized_advanced(0);
}

// The run with argument 0 is skipped when the IS_BLOSSOM_CI environment
// variable is set.
TEST(MerlinHashTableTest, test_evict_strategy_customized_correct_rate) {
  test_evict_strategy_customized_correct_rate(16);
  // TODO(rhdong): remove the skip logic once the blossom CI issue is resolved
  // (presumed intent of the original note - confirm).
  const bool skip_hmem_check = (nullptr != std::getenv("IS_BLOSSOM_CI"));
  if (!skip_hmem_check) {
    test_evict_strategy_customized_correct_rate(0);
  } else {
    std::cout << "The HMEM check is skipped in blossom CI!" << std::endl;
  }
}

// Runs the full-table value check with 16 GB and with 0 GB passed as
// max_hbm_for_vectors.
TEST(MerlinHashTableTest, test_insert_or_assign_values_check) {
  test_insert_or_assign_values_check(16);
  // NOTE(review): an earlier TODO here asked to add this run back once a
  // "diff error issue in hybrid mode" was fixed; the call is enabled, so that
  // TODO looks stale - confirm with rhdong.
  test_insert_or_assign_values_check(0);
}


================================================
FILE: tests/reserved_keys_test.cc.cu
================================================
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <cstdint>
#include "merlin/types.cuh"
#include "merlin/utils.cuh"
#include "test_util.cuh"

using namespace nv::merlin;

/**
 * Evaluates IS_RESERVED_KEY for every input key and stores the outcome.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per key. Threads whose
 * global index is >= numKeys do nothing, so the grid may overshoot numKeys.
 *
 * @param keys    Array of `numKeys` keys to classify (read on the device).
 * @param results Array of `numKeys` flags; results[i] receives
 *                IS_RESERVED_KEY(keys[i]).
 * @param numKeys Number of elements in `keys` and `results`.
 */
__global__ void testReservedKeysKernel(uint64_t* keys, bool* results,
                                       size_t numKeys) {
  // Index computed in size_t: no signed/unsigned comparison against numKeys
  // and no 32-bit overflow in blockIdx.x * blockDim.x for large launches.
  const size_t idx =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < numKeys) {
    results[idx] = IS_RESERVED_KEY(keys[idx]);
  }
}

/**
 * Fills a small device buffer with a 64-bit pattern through memset64Async and
 * verifies on the host that every element holds that pattern.
 *
 * Mismatches are reported through GoogleTest (EXPECT_EQ), so the check stays
 * active in builds where `assert` is compiled out, and the buffers are still
 * released after a failure.
 */
void testCustomMemsetAsync() {
  size_t numElements = 4;
  uint64_t value = 0xFFFFFFFFFFFFFFF1;
  uint64_t* devPtr = nullptr;
  uint64_t* hostData = new uint64_t[numElements];

  CUDA_CHECK(cudaMalloc((void**)&devPtr, numElements * sizeof(uint64_t)));
  memset64Async(devPtr, value, numElements);
  // Host-blocking copy; presumably memset64Async runs on the default stream
  // so the copy observes the completed fill - TODO confirm.
  CUDA_CHECK(cudaMemcpy(hostData, devPtr, numElements * sizeof(uint64_t),
                        cudaMemcpyDeviceToHost));

  size_t mismatches = 0;
  for (size_t i = 0; i < numElements; i++) {
    EXPECT_EQ(hostData[i], value) << "element " << i;
    if (hostData[i] != value) ++mismatches;
  }

  if (mismatches == 0) {
    std::cout << "All values were set correctly!" << std::endl;
  }

  CUDA_CHECK(cudaFree(devPtr));
  delete[] hostData;
}

/**
 * Runs testReservedKeysKernel over a set of host keys and compares the
 * per-key classification with the expected flags.
 *
 * Mismatches are reported through GoogleTest (EXPECT_EQ), so the check stays
 * active in builds where `assert` is compiled out.
 *
 * @param testKeys        Host array of `numKeys` keys to classify.
 * @param expectedResults Host array of `numKeys` expected IS_RESERVED_KEY
 *                        outcomes.
 * @param numKeys         Number of entries to check.
 */
void testReservedKeys(uint64_t* testKeys, bool* expectedResults,
                      size_t numKeys) {
  uint64_t* d_keys = nullptr;
  bool* d_results = nullptr;
  bool* h_results = new bool[numKeys];

  CUDA_CHECK(cudaMalloc(&d_keys, numKeys * sizeof(uint64_t)));
  CUDA_CHECK(cudaMalloc(&d_results, numKeys * sizeof(bool)));

  CUDA_CHECK(cudaMemcpy(d_keys, testKeys, numKeys * sizeof(uint64_t),
                        cudaMemcpyHostToDevice));

  // One thread per key, rounded up to whole blocks.
  int blockSize = 256;
  int numBlocks = (numKeys + blockSize - 1) / blockSize;

  testReservedKeysKernel<<<numBlocks, blockSize>>>(d_keys, d_results, numKeys);
  CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
  CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

  CUDA_CHECK(cudaMemcpy(h_results, d_results, numKeys * sizeof(bool),
                        cudaMemcpyDeviceToHost));

  size_t mismatches = 0;
  for (size_t i = 0; i < numKeys; i++) {
    EXPECT_EQ(h_results[i], expectedResults[i]) << "key index " << i;
    if (h_results[i] != expectedResults[i]) ++mismatches;
  }

  CUDA_CHECK(cudaFree(d_keys));
  CUDA_CHECK(cudaFree(d_results));
  delete[] h_results;
  CudaCheckError();
  if (mismatches == 0) {
    std::cout << "All tests passed." << std::endl;
  }
}

/**
 * For every reserved-key start bit from 0 to MAX_RESERVED_KEY_BIT:
 * re-initialises the reserved keys with init_reserved_keys, reads the
 * RECLAIM_KEY and LOCKED_KEY symbols back to the host, and checks the
 * IS_RESERVED_KEY classification of a fixed key set via testReservedKeys.
 */
void testKeyOptions() {
  for (int i = 0; i <= MAX_RESERVED_KEY_BIT; i++) {
    CUDA_CHECK(init_reserved_keys(i));
    uint64_t host_reclaim_key, host_locked_key;
    // Checked like the call above, so a failed symbol read cannot feed
    // uninitialised keys into the comparison.
    CUDA_CHECK(
        cudaMemcpyFromSymbol(&host_reclaim_key, RECLAIM_KEY, sizeof(uint64_t)));
    CUDA_CHECK(
        cudaMemcpyFromSymbol(&host_locked_key, LOCKED_KEY, sizeof(uint64_t)));

    uint64_t testKeys[6] = {EMPTY_KEY_CPU, host_reclaim_key, host_locked_key,
                            UINT64_C(0x0), UINT64_C(0x10),   DEFAULT_EMPTY_KEY};
    bool expectedResults[6] = {true,  true,  true,
                               false, false, (i == 0) ? true : false};
    // NOTE(review): only the first 4 of the 6 entries are checked, so 0x10
    // and DEFAULT_EMPTY_KEY are never exercised. This looks like a leftover
    // count - confirm the last two expectations before raising it to 6.
    testReservedKeys(testKeys, expectedResults, 4);
  }
}

// Runs the reserved-key classification check for every start bit, then the
// memset64Async fill check.
TEST(ReservedKeysTest, testKeyOptions) {
  testKeyOptions();
  testCustomMemsetAsync();
}

================================================
FILE: tests/save_and_load_test.cc.cu
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <stdio.h>
#include "merlin/types.cuh"
#include "merlin_hashtable.cuh"
#include "merlin_localfile.hpp"
#include "test_util.cuh"

// Shared configuration for the tests in this file.
constexpr uint64_t DIM = 64;  // elements per value vector (options.dim)
using K = int64_t;            // key type
using S = uint64_t;           // score type
using V = float;              // value element type
using EvictStrategy = nv::merlin::EvictStrategy;
using TableOptions = nv::merlin::HashTableOptions;

template <typename Table>
void test_save_to_file() {
  std::string prefix = "checkpoint";
  size_t keynum = 1 * 1024 * 1024;
  size_t capacity = 2 * 1024 * 1024;
  size_t buffer_size = 1024 * 1024;
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  K* h_keys = nullptr;
  V* h_vectors = nullptr;
  S* h_scores = nullptr;
  CUDA_CHECK(cudaMallocHost(&h_keys, keynum * sizeof(K)));
  CUDA_CHECK(cudaMallocHost(&h_vectors, keynum * sizeof(V) * DIM));
  CUDA_CHECK(cudaMallocHost(&h_scores, keynum * sizeof(S)));
  memset(h_keys, 0, keynum * sizeof(K));
  memset(h_vectors, 0, keynum * sizeof(V) * DIM);
  memset(h_scores, 0, keynum * sizeof(S));
  test_util::create_random_keys<K, S>(h_keys, h_scores, keynum);
  printf("Pass create random keys.\n");

  K* d_keys = nullptr;
  V* d_vectors = nullptr;
  S* d_scores = nullptr;
  test_util::getBufferOnDevice(&d_keys, keynum * sizeof(K), stream);
  test_util::getBufferOnDevice(&d_vectors, keynum * sizeof(V) * DIM, stream);
  test_util::getBufferOnDevice(&d_scores, keynum * sizeof(S), stream);
  CUDA_CHECK(cudaMemcpyAsync(d_keys, h_keys, keynum * sizeof(K),
                             cudaMemcpyHostToDevice, stream));
  CUDA_CHECK(cudaMemcpyAsync(d_vectors, h_vectors, keynum * sizeof(V) * DIM,
                             cudaMemcpyHostToDevice, stream));
  CUDA_CHECK(cudaMemcpyAsync(d_scores, h_scores, keynum * sizeof(S),
                             cudaMemcpyHostToDevice, stream));
  printf("Create buffers.\n");

  TableOptions options;
  options.init_capacity = capacity;
  options.max_capacity = capacity;
  options.dim = DIM;

  std::unique_ptr<Table> table_0 = std::make_unique<Table>();
  std::unique_ptr<Table> table_1 = std::make_unique<Table>();
  table_0->init(options);
  table_1->init(options);
  printf("Init tables.\n");

  S global_epoch = 101;
  S* temp_score = (Table::evict_strategy == EvictStrategy::kLru ||
                   Table::evict_strategy == EvictStrategy::kEpochLru)
                      ? nullptr
                      : d_scores;
  table_0->set_global_epoch(global_epoch);
  table_0->insert_or_assign(keynum, d_keys, d_vectors, temp_score, stream);
  printf("Fill table_0.\n");
  nv::merlin::LocalKVFile<K, V, S> file;
  std::string keys_path = prefix + ".keys";
  std::string values_path = prefix + ".values";
  std::string scores_path = prefix + ".scores";
  file.open(keys_path, values_path, scores_path, "wb");
  table_0->save(&file, buffer_size, stream);
  file.close();
  printf("table_0 saves.\n");
  file.open(keys_path, values_path, scores_path, "rb");
  table_1->load(&file, buffer_size, stream);
  file.close();
  printf("table_1 loads.\n");
  bool check_score = !(Table::evict_strategy == EvictStrategy::kLru ||
                       Table::evict_strategy == EvictStrategy::kEpochLru);
  ASSERT_TRUE((test_util::tables_equal<K, V, S, Table>(
      table_0.get(), table_1.get(), check_score, stream)));
  printf("table_0 and table_1 are equal.\n");
  CUDA_FREE_POINTERS(stream, d_keys, d_vectors, d_scores, h_keys, h_vectors,
                     h_scores);
  CUDA_CHECK(cudaStreamSynchronize(stream));
}

// The cases below run the same save/load round trip once per eviction
// strategy; only the HashTable instantiation differs.

// Round trip with the kLru strategy.
TEST(SaveAndLoadTest, test_save_and_load_on_lru) {
  test_save_to_file<nv::merlin::HashTable<K, V, S, EvictStrategy::kLru>>();
}
// Round trip with the kLfu strategy.
TEST(SaveAndLoadTest, test_save_and_load_on_lfu) {
  test_save_to_file<nv::merlin::HashTable<K, V, S, EvictStrategy::kLfu>>();
}
// Round trip with the kEpochLru strategy.
TEST(SaveAndLoadTest, test_save_and_load_on_epochlru) {
  test_save_to_file<nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLru>>();
}
// Round trip with the kEpochLfu strategy.
TEST(SaveAndLoadTest, test_save_and_load_on_epochlfu) {
  test_save_to_file<nv::merlin::HashTable<K, V, S, EvictStrategy::kEpochLfu>>();
}
// Round trip with the kCustomized strategy.
TEST(SaveAndLoadTest, test_save_and_load_on_customized) {
  test_save_to_file<
      nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>>();
}


================================================
FILE: tests/test_util.cuh
================================================
/*
 * Copyright (c) 2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <random>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include "merlin/utils.cuh"
#include "merlin_hashtable.cuh"

// Prints "<file>:<line>:Unequal" followed by the source text of both
// expressions to std::cout. The expressions are only stringified, never
// evaluated, and the macro does not abort or fail the test by itself.
#define UNEQUAL_EXPR(expr1, expr2)                             \
  {                                                            \
    std::cout << __FILE__ << ":" << __LINE__ << ":Unequal\n"   \
              << "\t\t" << #expr1 << " != " << #expr2 << "\n"; \
  }

// Hard precondition check: when `cond` evaluates to false, prints `msg` with
// the file and line to stderr and terminates the process via exit(-1).
// NOTE(review): the expansion is a bare `if` statement, so using the macro as
// the body of an unbraced `if` that has an `else` would capture that `else`.
// Keep call sites braced, or wrap the macro in do { ... } while (0) after
// checking that every call site ends with a semicolon.
#define MERLIN_EXPECT_TRUE(cond, msg)                                    \
  if ((cond) == false) {                                                 \
    fprintf(stderr, "[ERROR] %s at %s : %d\n", msg, __FILE__, __LINE__); \
    exit(-1);                                                            \
  }

namespace test_util {

/**
 * Reads the PTX `%globaltimer` special register and stores its value.
 *
 * @tparam S  Destination type; the value is moved with a 64-bit `mov.u64`.
 * @param d_clk  Location that receives the timer value. Every launched thread
 *               writes to the same location.
 */
template <class S>
__global__ void host_nano_kernel(S* d_clk) {
  S timer_value;
  asm volatile("mov.u64 %0,%%globaltimer;" : "=l"(timer_value));
  d_clk[0] = timer_value;
}

/**
 * Returns the device global timer value sampled by host_nano_kernel().
 *
 * Allocates a one-element device buffer, launches a single-thread kernel on
 * `stream`, waits for it, and copies the sampled value back to the host.
 *
 * @tparam S      Type of the returned timer value.
 * @param stream  Stream used for the kernel launch (default stream if 0).
 * @return The value written by the kernel.
 */
template <class S>
S host_nano(cudaStream_t stream = 0) {
  S h_clk = 0;
  S* d_clk;

  CUDA_CHECK(cudaMalloc((void**)&(d_clk), sizeof(S)));
  host_nano_kernel<S><<<1, 1, 0, stream>>>(d_clk);
  // A launch does not return a status; query it explicitly so a failed launch
  // is reported here instead of silently yielding a stale value.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaStreamSynchronize(stream));

  CUDA_CHECK(cudaMemcpy(&h_clk, d_clk, sizeof(S), cudaMemcpyDeviceToHost));
  CUDA_CHECK(cudaFree(d_clk));
  return h_clk;
}

/**
 * Counts the `false` entries of `conds[0..n)` and adds the count to `*nfalse`.
 *
 * The input is split into `gridDim.x` contiguous stripes, one per block; the
 * threads of a block walk their stripe with a step of `blockDim.x`. Each block
 * accumulates into a shared counter and thread 0 publishes it with a single
 * atomic add.
 *
 * The stripe end is computed from the block's first element rather than from
 * each thread's own start, so no thread runs into the next block's stripe and
 * every element is counted exactly once.
 *
 * @param conds   Device array of `n` flags.
 * @param n       Number of flags.
 * @param nfalse  Device counter; must be initialised by the caller.
 */
__global__ void all_true(const bool* conds, size_t n, int* nfalse) {
  const size_t stripe =
      (n + gridDim.x - 1) /
      gridDim.x;  // number of elements assigned to each block.
  const size_t block_begin = blockIdx.x * stripe;
  const size_t block_end = min(block_begin + stripe, n);

  __shared__ int local_nfalse;
  if (threadIdx.x == 0) {
    local_nfalse = 0;
  }
  __syncthreads();

  for (size_t i = block_begin + threadIdx.x; i < block_end; i += blockDim.x) {
    if (!conds[i]) {
      atomicAdd(&local_nfalse, 1);
    }
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    atomicAdd(nfalse, local_nfalse);
  }
}

/**
 * Counts the positions where `a[i] != b[i]` for i in [0, n) and adds the
 * count to `*ndiff`.
 *
 * The input is split into `gridDim.x` contiguous stripes, one per block; the
 * threads of a block walk their stripe with a step of `blockDim.x`. Each block
 * accumulates into a shared counter and thread 0 publishes it with a single
 * atomic add.
 *
 * The stripe end is computed from the block's first element rather than from
 * each thread's own start, so no thread runs into the next block's stripe and
 * every position is compared exactly once.
 *
 * @tparam T     Element type; compared with `operator!=`.
 * @param a      First device array of `n` elements.
 * @param b      Second device array of `n` elements.
 * @param n      Number of elements to compare.
 * @param ndiff  Device counter; must be initialised by the caller.
 */
template <typename T>
__global__ void all_equal(T* a, T* b, size_t n, int* ndiff) {
  const size_t stripe =
      (n + gridDim.x - 1) /
      gridDim.x;  // number of elements assigned to each block.
  const size_t block_begin = blockIdx.x * stripe;
  const size_t block_end = min(block_begin + stripe, n);

  __shared__ int local_ndiff;
  if (threadIdx.x == 0) {
    local_ndiff = 0;
  }
  __syncthreads();

  for (size_t i = block_begin + threadIdx.x; i < block_end; i += blockDim.x) {
    if (a[i] != b[i]) {
      atomicAdd(&local_ndiff, 1);
    }
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    atomicAdd(ndiff, local_ndiff);
  }
}

/**
 * Returns the current std::chrono::system_clock time as the number of
 * milliseconds since the clock's epoch.
 */
uint64_t getTimestamp() {
  const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
  const auto as_millis =
      std::chrono::duration_cast<std::chrono::milliseconds>(since_epoch);
  return as_millis.count();
}

/**
 * Fills `h_keys` with `KEY_NUM` distinct random keys and derives one score
 * per key.
 *
 * Keys are drawn from a default-constructed std::uniform_int_distribution<K>
 * until `KEY_NUM` distinct values exist; they are written in the iteration
 * order of the backing std::unordered_set.
 *
 * @param h_keys      Output array with room for `KEY_NUM` keys.
 * @param h_scores    Output array with room for `KEY_NUM` scores; entry i is
 *                    `h_keys[i] % freq_range`.
 * @param KEY_NUM     Number of keys to generate.
 * @param freq_range  Modulus applied to a key to obtain its score.
 */
template <class K, class S>
void create_random_keys(K* h_keys, S* h_scores, int KEY_NUM,
                        int freq_range = 1000) {
  std::random_device seed_source;
  std::mt19937_64 generator(seed_source());
  std::uniform_int_distribution<K> any_key;

  std::unordered_set<K> unique_keys;
  while (unique_keys.size() < KEY_NUM) {
    unique_keys.insert(any_key(generator));
  }

  int slot = 0;
  for (const K key : unique_keys) {
    h_keys[slot] = key;
    h_scores[slot] = key % freq_range;
    ++slot;
  }
}

/**
 * Generates `KEY_NUM` distinct random keys reduced modulo `range`, with
 * optional scores and optional `DIM`-wide vectors.
 *
 * @tparam DIM       Number of vector elements written per key.
 * @param h_keys     Output array with room for `KEY_NUM` keys.
 * @param h_scores   Optional; entry i receives the key itself.
 * @param h_vectors  Optional; every element of row i receives
 *                   `static_cast<float>(key * 0.00001)`.
 * @param KEY_NUM    Number of keys to generate.
 * @param range      Modulus applied to each random draw.
 */
template <class K, class S, class V, size_t DIM = 16>
void create_random_keys(K* h_keys, S* h_scores, V* h_vectors, int KEY_NUM,
                        size_t range = std::numeric_limits<uint64_t>::max()) {
  std::random_device seed_source;
  std::mt19937_64 generator(seed_source());
  std::uniform_int_distribution<K> any_key;

  std::unordered_set<K> unique_keys;
  while (unique_keys.size() < KEY_NUM) {
    unique_keys.insert(any_key(generator) % range);
  }

  int row = 0;
  for (const K key : unique_keys) {
    h_keys[row] = key;
    if (h_scores != nullptr) {
      h_scores[row] = key;
    }
    if (h_vectors != nullptr) {
      // The whole row carries one value derived from the key.
      const float fill = static_cast<float>(key * 0.00001);
      for (size_t col = 0; col < DIM; ++col) {
        h_vectors[row * DIM + col] = fill;
      }
    }
    ++row;
  }
}

/**
 * Fills `bools[0..KEY_NUM)` with random flags.
 *
 * Each flag is true when a random draw taken modulo 1000 is below
 * `1000 * true_ratio` (converted to K).
 *
 * @tparam K          Integer type used for the random draws.
 * @param bools       Output array with room for `KEY_NUM` flags.
 * @param KEY_NUM     Number of flags to generate.
 * @param true_ratio  Approximate fraction of flags set to true.
 */
template <class K>
void create_random_bools(bool* bools, int KEY_NUM, float true_ratio = 0.6) {
  std::random_device seed_source;
  std::mt19937_64 generator(seed_source());
  std::uniform_int_distribution<K> any_value;

  // The threshold does not depend on the loop index; compute it once.
  const K bound = 1000 * true_ratio;
  for (int idx = 0; idx < KEY_NUM; ++idx) {
    const auto draw = any_value(generator) % 1000;
    bools[idx] = (draw < bound);
  }
}

/**
 * Runtime-`dim` variant: generates `KEY_NUM` distinct random keys reduced
 * modulo `range`, with optional scores and optional `dim`-wide vectors.
 *
 * @param dim        Number of vector elements written per key.
 * @param h_keys     Output array with room for `KEY_NUM` keys.
 * @param h_scores   Optional; entry i receives the key itself.
 * @param h_vectors  Optional; every element of row i receives
 *                   `static_cast<V>(key * 0.00001)`.
 * @param KEY_NUM    Number of keys to generate.
 * @param range      Modulus applied to each random draw.
 */
template <class K, class S, class V>
void create_random_keys(size_t dim, K* h_keys, S* h_scores, V* h_vectors,
                        int KEY_NUM,
                        size_t range = std::numeric_limits<uint64_t>::max()) {
  std::random_device seed_source;
  std::mt19937_64 generator(seed_source());
  std::uniform_int_distribution<K> any_key;

  std::unordered_set<K> unique_keys;
  while (unique_keys.size() < KEY_NUM) {
    unique_keys.insert(any_key(generator) % range);
  }

  int row = 0;
  for (const K key : unique_keys) {
    h_keys[row] = key;
    if (h_scores != nullptr) {
      h_scores[row] = key;
    }
    if (h_vectors != nullptr) {
      // The whole row carries one value derived from the key.
      const V fill = static_cast<V>(key * 0.00001);
      for (size_t col = 0; col < dim; ++col) {
        h_vectors[row * dim + col] = fill;
      }
    }
    ++row;
  }
}

/**
 * Generates `KEY_NUM` distinct random keys reduced modulo `range`; scores are
 * the key modulo `freq_range`.
 *
 * @param dim         Number of vector elements written per key.
 * @param h_keys      Output array with room for `KEY_NUM` keys.
 * @param h_scores    Optional; entry i receives `key % freq_range`.
 * @param h_vectors   Optional; every element of row i receives
 *                    `static_cast<float>(key * 0.00001)`.
 * @param KEY_NUM     Number of keys to generate.
 * @param range       Modulus applied to each random draw.
 * @param freq_range  Modulus applied to a key to obtain its score.
 */
template <class K, class S, class V>
void create_random_keys_advanced(
    size_t dim, K* h_keys, S* h_scores, V* h_vectors, int KEY_NUM,
    size_t range = std::numeric_limits<uint64_t>::max(), int freq_range = 10) {
  std::random_device seed_source;
  std::mt19937_64 generator(seed_source());
  std::uniform_int_distribution<K> any_key;

  std::unordered_set<K> unique_keys;
  while (unique_keys.size() < KEY_NUM) {
    unique_keys.insert(any_key(generator) % range);
  }

  int row = 0;
  for (const K key : unique_keys) {
    h_keys[row] = key;
    if (h_scores != nullptr) {
      h_scores[row] = key % freq_range;
    }
    if (h_vectors != nullptr) {
      // The whole row carries one value derived from the key.
      const float fill = static_cast<float>(key * 0.00001);
      for (size_t col = 0; col < dim; ++col) {
        h_vectors[row * dim + col] = fill;
      }
    }
    ++row;
  }
}

/**
 * Generates `KEY_NUM` distinct keys, mixing keys taken from a previous batch
 * with fresh random keys.
 *
 * On every iteration a coin flip with probability `repeat_rate` decides
 * whether the next entry of `pre_h_keys` is reused or a fresh random key
 * (reduced modulo `range`) is drawn.
 *
 * NOTE(review): `pre_h_keys` is consumed sequentially without a bound; the
 * caller presumably guarantees it is long enough -- confirm at call sites.
 *
 * @param dim          Number of vector elements written per key.
 * @param h_keys       Output array with room for `KEY_NUM` keys.
 * @param pre_h_keys   Keys of a previous batch that may be repeated.
 * @param h_scores     Optional; entry i receives `key % freq_range`.
 * @param h_vectors    Optional; every element of row i receives
 *                     `static_cast<float>(key * 0.00001)`.
 * @param KEY_NUM      Number of keys to generate.
 * @param range        Modulus applied to each fresh random draw.
 * @param freq_range   Modulus applied to a key to obtain its score.
 * @param repeat_rate  Probability of reusing a key from `pre_h_keys`.
 */
template <class K, class S, class V>
void create_random_keys_advanced(
    size_t dim, K* h_keys, K* pre_h_keys, S* h_scores, V* h_vectors,
    int KEY_NUM, size_t range = std::numeric_limits<uint64_t>::max(),
    int freq_range = 10, float repeat_rate = 0.9) {
  std::random_device seed_source;
  std::mt19937_64 key_generator(seed_source());
  std::uniform_int_distribution<K> any_key;
  std::mt19937_64 coin_generator(seed_source());
  std::uniform_int_distribution<K> any_coin;

  std::unordered_set<K> unique_keys;
  int next_previous = 0;
  while (unique_keys.size() < KEY_NUM) {
    const K coin = static_cast<K>(any_coin(coin_generator) % 100000);
    const K threshold = static_cast<K>(repeat_rate * 100000);
    if (coin < threshold) {
      unique_keys.insert(pre_h_keys[next_previous++]);
    } else {
      unique_keys.insert(any_key(key_generator) % range);
    }
  }

  int row = 0;
  for (const K key : unique_keys) {
    h_keys[row] = key;
    if (h_scores != nullptr) {
      h_scores[row] = key % freq_range;
    }
    if (h_vectors != nullptr) {
      // The whole row carries one value derived from the key.
      const float fill = static_cast<float>(key * 0.00001);
      for (size_t col = 0; col < dim; ++col) {
        h_vectors[row * dim + col] = fill;
      }
    }
    ++row;
  }
}

/**
 * Host-side 64-bit mixing function: three xor-shift-by-33 steps interleaved
 * with two multiplications by fixed odd constants.
 *
 * @param key  Value to hash.
 * @return The mixed 64-bit value; 0 maps to 0.
 */
inline uint64_t Murmur3HashHost(const uint64_t& key) {
  constexpr uint64_t kFirstMultiplier = UINT64_C(0xff51afd7ed558ccd);
  constexpr uint64_t kSecondMultiplier = UINT64_C(0xc4ceb9fe1a85ec53);

  uint64_t mixed = key;
  mixed = (mixed ^ (mixed >> 33)) * kFirstMultiplier;
  mixed = (mixed ^ (mixed >> 33)) * kSecondMultiplier;
  return mixed ^ (mixed >> 33);
}

/**
 * Generates `KEY_NUM` consecutive keys `start, start + 1, ...` with optional
 * scores and optional `DIM`-wide vectors.
 *
 * @tparam DIM       Number of vector elements written per key.
 * @param h_keys     Output array with room for `KEY_NUM` keys.
 * @param h_scores   Optional; entry i receives the key itself. May be
 *                   nullptr, matching the other generators in this header.
 * @param h_vectors  Optional; every element of row i receives
 *                   `static_cast<V>(key * 0.00001)`.
 * @param KEY_NUM    Number of keys to generate.
 * @param start      First key of the sequence.
 */
template <class K, class S, class V, size_t DIM = 16>
void create_continuous_keys(K* h_keys, S* h_scores, V* h_vectors, int KEY_NUM,
                            K start = 1) {
  for (K i = 0; i < KEY_NUM; i++) {
    h_keys[i] = start + static_cast<K>(i);
    if (h_scores != nullptr) {
      h_scores[i] = h_keys[i];
    }
    if (h_vectors != nullptr) {
      for (size_t j = 0; j < DIM; j++) {
        h_vectors[i * DIM + j] = static_cast<V>(h_keys[i] * 0.00001);
      }
    }
  }
}

/**
 * Generates `KEY_NUM` distinct random keys that all hash into one bucket.
 *
 * A candidate from [min, max) is kept when
 * `(Murmur3HashHost(candidate) & (capacity - 1)) / bucket_max_size` equals
 * `bucket_idx`. The mask assumes `capacity` is a power of two.
 *
 * @tparam DIM             Number of vector elements written per key.
 * @param h_keys           Output array with room for `KEY_NUM` keys.
 * @param h_scores         Optional; entry i receives the key itself.
 * @param h_vectors        Optional; every element of row i receives
 *                         `static_cast<float>(key * 0.00001)`. May be
 *                         nullptr, matching the other generators here.
 * @param KEY_NUM          Number of keys to generate.
 * @param capacity         Table capacity used for the index mask.
 * @param bucket_max_size  Number of slots per bucket.
 * @param bucket_idx       Bucket every generated key must land in.
 * @param min              Inclusive lower bound for candidates.
 * @param max              Exclusive upper bound for candidates.
 */
template <class K, class S, class V, size_t DIM = 16>
void create_keys_in_one_buckets(K* h_keys, S* h_scores, V* h_vectors,
                                int KEY_NUM, int capacity,
                                int bucket_max_size = 128, int bucket_idx = 0,
                                K min = 0,
                                K max = static_cast<K>(0xFFFFFFFFFFFFFFFD)) {
  std::unordered_set<K> numbers;
  std::random_device rd;
  std::mt19937_64 eng(rd());
  std::uniform_int_distribution<K> distr;
  K candidate;
  K hashed_key;
  size_t global_idx;
  size_t bkt_idx;
  int i = 0;

  // Rejection sampling: keep only candidates that map to the wanted bucket.
  while (numbers.size() < KEY_NUM) {
    candidate = (distr(eng) % (max - min)) + min;
    hashed_key = Murmur3HashHost(candidate);
    global_idx = hashed_key & (capacity - 1);
    bkt_idx = global_idx / bucket_max_size;
    if (bkt_idx == bucket_idx) {
      numbers.insert(candidate);
    }
  }
  for (const K num : numbers) {
    h_keys[i] = num;
    if (h_scores != nullptr) {
      h_scores[i] = num;
    }
    if (h_vectors != nullptr) {
      for (size_t j = 0; j < DIM; j++) {
        *(h_vectors + i * DIM + j) = static_cast<float>(num * 0.00001);
      }
    }
    i++;
  }
}

/**
 * Generates `KEY_NUM` distinct random keys that all hash into one bucket,
 * with scores equal to the key modulo `freq_range`.
 *
 * A candidate from [min, max) is kept when
 * `(Murmur3HashHost(candidate) & (capacity - 1)) / bucket_max_size` equals
 * `bucket_idx`. The mask assumes `capacity` is a power of two.
 *
 * @tparam DIM             Number of vector elements written per key.
 * @param h_keys           Output array with room for `KEY_NUM` keys.
 * @param h_scores         Optional; entry i receives `key % freq_range`.
 * @param h_vectors        Optional; every element of row i receives
 *                         `static_cast<float>(key * 0.00001)`. May be
 *                         nullptr, matching the other generators here.
 * @param KEY_NUM          Number of keys to generate.
 * @param capacity         Table capacity used for the index mask.
 * @param bucket_max_size  Number of slots per bucket.
 * @param bucket_idx       Bucket every generated key must land in.
 * @param min              Inclusive lower bound for candidates.
 * @param max              Exclusive upper bound for candidates.
 * @param freq_range       Modulus applied to a key to obtain its score.
 */
template <class K, class S, class V, size_t DIM = 16>
void create_keys_in_one_buckets_lfu(K* h_keys, S* h_scores, V* h_vectors,
                                    int KEY_NUM, int capacity,
                                    int bucket_max_size = 128,
                                    int bucket_idx = 0, K min = 0,
                                    K max = static_cast<K>(0xFFFFFFFFFFFFFFFD),
                                    int freq_range = 1000) {
  std::unordered_set<K> numbers;
  std::random_device rd;
  std::mt19937_64 eng(rd());
  std::uniform_int_distribution<K> distr;
  K candidate;
  K hashed_key;
  size_t global_idx;
  size_t bkt_idx;
  int i = 0;

  // Rejection sampling: keep only candidates that map to the wanted bucket.
  while (numbers.size() < KEY_NUM) {
    candidate = (distr(eng) % (max - min)) + min;
    hashed_key = Murmur3HashHost(candidate);
    global_idx = hashed_key & (capacity - 1);
    bkt_idx = global_idx / bucket_max_size;
    if (bkt_idx == bucket_idx) {
      numbers.insert(candidate);
    }
  }
  for (const K num : numbers) {
    h_keys[i] = num;
    if (h_scores != nullptr) {
      h_scores[i] = num % freq_range;
    }
    if (h_vectors != nullptr) {
      for (size_t j = 0; j < DIM; j++) {
        *(h_vectors + i * DIM + j) = static_cast<float>(num * 0.00001);
      }
    }
    i++;
  }
}

/**
 * Builds the score expected for an epoch-LFU entry: `global_epoch` in the
 * bits above bit 31 and the frequency in the low 32 bits.
 *
 * A frequency of 0xFFFFFFFF or more saturates to 0xFFFFFFFF instead of
 * wrapping.
 *
 * @param global_epoch    Epoch value placed in the upper part of the score.
 * @param original_score  Frequency to place in the lower 32 bits.
 * @return The combined score.
 */
template <class S>
S make_expected_score_for_epochlfu(S global_epoch, S original_score) {
  const S saturated = static_cast<S>(0xFFFFFFFF);
  if (original_score >= saturated) {
    return (global_epoch << 32) | saturated;
  }
  return (global_epoch << 32) | (original_score & 0xFFFFFFFF);
}

template <typename T>
void getBufferOnDevice(T** ptr, size_t size, cudaStream_t stream) {
  MERLIN_EXPECT_TRUE((*ptr == nullptr), "Pointer is already assigned.");
  CUDA_CHECK(cudaMallocAsync(ptr, size, stream));
  CUDA_CHECK(cudaMemsetAsync(*ptr, 0, size, stream));
}

/**
 * Enqueues the release of `ptr` on `stream` with cudaFreeAsync.
 *
 * The pointer is passed by value, so the caller's variable is NOT reset;
 * callers that keep the variable around must null it themselves. The earlier
 * assignment of nullptr to the parameter had no effect and only suggested
 * otherwise.
 *
 * @param ptr     Device pointer to release.
 * @param stream  Stream the release is ordered on.
 */
void freeBufferOnDevice(void* ptr, cudaStream_t stream) {
  CUDA_CHECK(cudaFreeAsync(ptr, stream));
}

/**
 * Fixed-size value vector of `DIM` elements usable on host and device.
 *
 * @tparam T    Element type.
 * @tparam DIM  Number of elements.
 */
template <typename T, size_t DIM>
struct ValueArray {
 public:
  T data[DIM];

  /**
   * Returns the sum of all `DIM` elements.
   * The accumulated value is now actually returned; previously the function
   * fell off its end without a return statement, which is undefined
   * behaviour for a non-void function.
   */
  __host__ __device__ T sum() {
    T s = 0;
    for (size_t i = 0; i < DIM; i++) {
      s += data[i];
    }
    return s;
  }

  /** Returns a copy of element `i`; no bounds checking is performed. */
  __host__ __device__ T operator[](size_t i) { return data[i]; }
};

template <typename T>
struct HostAndDeviceBuffer {
 public:
  void Alloc(size_t n, cudaStream_t stream = 0) {
    if (d_data) {
      CUDA_FREE_POINTERS(stream, d_data);
    }
    if (h_data) {
      free(h_data);
      h_data = nullptr;
    }
    if (d_data) {
      CUDA_CHECK(cudaStreamSynchronize(stream));
      d_data = nullptr;
    }
    getBufferOnDevice(&d_data, n * sizeof(T), stream);
    h_data = (T*)malloc(n * sizeof(T));
    size_ = n;
  }

  ~HostAndDeviceBuffer() {
    CUDA_CHECK(cudaDeviceSynchronize());
    Free();
    CUDA_CHECK(cudaDeviceSynchronize());
  }

  void Free(cudaStream_t stream = 0) {
    if (d_data) {
      CUDA_FREE_POINTERS(stream, d_data);
    }
    if (h_data) {
      free(h_data);
      h_data = nullptr;
    }
    if (d_data) {
      CUDA_CHECK(cudaStreamSynchronize(stream));
      d_data = nullptr;
    }
    size_ = 0;
  }

  void SetFromHost(const T* data, size_t n, cudaStream_t stream = 0) {
    CUDA_CHECK(cudaMemcpyAsync(d_data, data, n * sizeof(T),
                               cudaMemcpyHostToDevice, stream));
    memcpy(h_data, data, n * sizeof(T));
  }

  void SetFromDevice(const T* data, size_t n, cudaStream_t stream = 0) {
    CUDA_CHECK(cudaMemcpyAsync(d_data, data, n * sizeof(T),
                               cudaMemcpyDeviceToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(h_data, data, n * sizeof(T),
                               cudaMemcpyDeviceToHost, stream));
  }

  bool SetValueInRange(T start, T skip, size_t stripe,
                       cudaStream_t stream = 0) {
    if (!h_data || skip == 0 || stripe == 0 || size_ % stripe != 0) {
      return false;
    }

    size_t n_stripe = size_ / stripe;
    for (size_t i = 0; i < n_stripe; i++) {
      T value = start + static_cast<T>(i) * skip;
      for (size_t j = 0; j < stripe; j++) {
        h_data[i * stripe + j] = value;
      }
    }
    CUDA_CHECK(cudaMemcpyAsync(d_data, h_data, size_ * sizeof(T),
                               cudaMemcpyHostToDevice, stream));
    return true;
  }

  void ToZeros(cudaStream_t stream = 0) {
    CUDA_CHECK(cudaMemsetAsync(d_data, 0, size_ * sizeof(T), stream));
    memset(h_data, 0, size_ * sizeof(T));
  }

  void ToConst(const T val, cudaStream_t stream) {
    for (size_t i = 0; i < size_; i++) {
      h_data[i] = val;
    }
    CUDA_CHECK(cudaMemcpyAsync(d_data, h_data, size_ * sizeof(T),
                               cudaMemcpyHostToDevice, stream));
  }

  void SyncData(bool h2d, cudaStream_t stream = 0) {
    if (h2d) {
      CUDA_CHECK(cudaMemcpyAsync(d_data, h_data, size_ * sizeof(T),
                                 cudaMemcpyHostToDevice, stream));
    } else {
      CUDA_CHECK(cudaMemcpyAsync(h_data, d_data, size_ * sizeof(T),
                                 cudaMemcpyDeviceToHost, stream));
    }
  }

 public:
  T* h_data = nullptr;
  T* d_data = nullptr;
  size_t size_ = 0;
};

template <typename K, typename V, typename S>
struct KVMSBuffer {
 public:
  KVMSBuffer() : len_(0), dim_(0) {}

  void Reserve(size_t n, size_t dim, cudaStream_t stream = 0) {
    keys.Alloc(n, stream);
    values.Alloc(n * dim, stream);
    scores.Alloc(n, stream);
    status.Alloc(n, stream);
    len_ = n;
    dim_ = dim;
  }

  ~KVMSBuffer() {
    CUDA_CHECK(cudaDeviceSynchronize());
    Free();
    CUDA_CHECK(cudaDeviceSynchronize());
  }

  void Free(cudaStream_t stream = 0) {
    keys.Free(stream);
    values.Free(stream);
    scores.Free(stream);
    status.Free(stream);
    len_ = 0;
  }

  size_t len() const { return len_; }
  size_t dim() const { return dim_; }

  void ToRange(size_t start, size_t skip = 1, cudaStream_t stream = 0) {
    keys.SetValueInRange(static_cast<K>(start), static_cast<K>(skip), 1,
                         stream);
    values.SetValueInRange(static_cast<V>(start), static_cast<V>(skip), dim_,
                           stream);
    status.ToZeros(stream);
  }

  void ToZeros(cudaStream_t stream) {
    keys.ToZeros(stream);
    values.ToZeros(stream);
    scores.ToZeros(stream);
    status.ToZeros(stream);
  }

  void Setscore(const S score, cudaStream_t stream) {
    scores.ToConst(score, stream);
  }

  K* keys_ptr(bool on_device = true) {
    if (on_device) {
      return keys.d_data;
    }
    return keys.h_data;
  }

  V* values_ptr(bool on_device = true) {
    if (on_device) {
      return values.d_data;
    }
    return values.h_data;
  }

  S* scores_ptr(bool on_device = true) {
    if (on_device) {
      return scores.d_data;
    }
    return scores.h_data;
  }

  bool* status_ptr(bool on_device = true) {
    if (on_device) {
      return status.d_data;
    }
    return status.h_data;
  }

  void SyncData(bool h2d, cudaStream_t stream = 0) {
    keys.SyncData(h2d, stream);
    values.SyncData(h2d, stream);
    scores.SyncData(h2d, stream);
    status.SyncData(h2d, stream);
  }

  void CopyFrom(KVMSBuffer<K, V, S>& src, cudaStream_t stream = 0) {
    memcpy(keys_ptr(false), src.keys_ptr(false), sizeof(K) * len());
    memcpy(scores_ptr(false), src.scores_ptr(false), sizeof(S) * len());
    memcpy(values_ptr(false), src.values_ptr(false), sizeof(V) * len() * dim());
    keys.SyncData(true, stream);
    values.SyncData(true, stream);
    scores.SyncData(true, stream);
    status.SyncData(true, stream);
  }

  void CopyFromByRate(KVMSBuffer<K, V, S>& src, float repeat_rate,
                      cudaStream_t stream = 0) {
    memcpy(keys_ptr(false), src.keys_ptr(false), sizeof(K) * len());
    memcpy(scores_ptr(false), src.scores_ptr(false), sizeof(S) * len());
    memcpy(values_ptr(false), src.values_ptr(false), sizeof(V) * len() * dim());
    keys.SyncData(true, stream);
    values.SyncData(true, stream);
    scores.SyncData(true, stream);
    status.SyncData(true, stream);
  }

 public:
  HostAndDeviceBuffer<K> keys;
  HostAndDeviceBuffer<V> values;
  HostAndDeviceBuffer<S> scores;
  HostAndDeviceBuffer<bool> status;
  size_t dim_;
  size_t len_;
};

bool allTrueGpu(const bool* conds, size_t n, cudaStream_t stream) {
  int nfalse = 0;
  int* d_nfalse = nullptr;
  getBufferOnDevice(&d_nfalse, sizeof(int), stream);
  int block_size = 128;
  int grid_size = (n + block_size - 1) / block_size;
  all_true<<<grid_size, block_size, 0, stream>>>(conds, n, d_nfalse);
  CUDA_CHECK(cudaMemcpyAsync(&nfalse, d_nfalse, sizeof(int),
                             cudaMemcpyDeviceToHost, stream));
  cudaStreamSynchronize(stream);
  freeBufferOnDevice(d_nfalse, stream);
  cudaStreamSynchronize(stream);
  return nfalse == 0;
}

template <typename T>
bool allEqualGpu(T* a, T* b, size_t n, cudaStream_t stream) {
  int ndiff = 0;
  int* d_ndiff = nullptr;
  getBufferOnDevice(&d_ndiff, sizeof(int), stream);
  int block_size = 128;
  int grid_size = (n + block_size - 1) / block_size;
  all_equal<<<grid_size, block_size, 0, stream>>>(a, b, n, d_ndiff);
  CUDA_CHECK(cudaMemcpyAsync(&ndiff, d_ndiff, sizeof(int),
                             cudaMemcpyDeviceToHost, stream));
  freeBufferOnDevice(d_ndiff, stream);
  cudaStreamSynchronize(stream);
  return ndiff == 0;
}

template <typename K, typename V, typename S, typename Table>
bool tables_equal(Table* a, Table* b, bool check_score, cudaStream_t stream) {
  size_t size = a->size(stream);
  if (size != b->size(stream)) {
    return false;
  }

  if (a->dim() != b->dim()) {
    return false;
  }

  size_t* d_size = nullptr;
  K* d_keys = nullptr;
  V* d_vectors = nullptr;
  S* d_scores = nullptr;
  bool* d_founds_in_b = nullptr;
  V* d_vectors_in_b = nullptr;
  S* d_scores_in_b = nullptr;

  getBufferOnDevice(&d_size, sizeof(size_t), stream);
  getBufferOnDevice(&d_keys, sizeof(K) * size, stream);
  getBufferOnDevice(&d_vectors, sizeof(V) * size * a->dim(), stream);
  getBufferOnDevice(&d_scores, sizeof(S) * size, stream);
  getBufferOnDevice(&d_founds_in_b, sizeof(bool) * size, stream);
  getBufferOnDevice(&d_vectors_in_b, sizeof(V) * size * a->dim(), stream);
  getBufferOnDevice(&d_scores_in_b, sizeof(S) * size, stream);

  a->export_batch(a->capacity(), 0, d_size, d_keys, d_vectors, d_scores,
                  stream);
  b->find(size, d_keys, d_vectors_in_b, d_founds_in_b, d_scores_in_b, stream);
  if (!allTrueGpu(d_founds_in_b, size, stream)) {
    CUDA_FREE_POINTERS(stream, d_size, d_keys, d_vectors, d_scores,
                       d_founds_in_b, d_vectors_in_b, d_scores_in_b);
    return false;
  }
  if (check_score && !allEqualGpu<S>(d_scores, d_scores_in_b, size, stream)) {
    CUDA_FREE_POINTERS(stream, d_size, d_keys, d_vectors, d_scores,
                       d_founds_in_b, d_vectors_in_b, d_scores_in_b);
    return false;
  }
  if (!allEqualGpu(d_vectors, d_vectors_in_b, size * a->dim(), stream)) {
    CUDA_FREE_POINTERS(stream, d_size, d_keys, d_vectors, d_scores,
                       d_founds_in_b, d_vectors_in_b, d_scores_in_b);
    return false;
  }
  return true;
}

/**
 * Returns the `N` consecutive values `start, start + 1, ..., start + N - 1`.
 *
 * @tparam T     Element type.
 * @tparam N     Number of elements.
 * @param start  First value of the sequence.
 */
template <typename T, std::size_t N>
std::array<T, N> range(const T start) {
  std::array<T, N> sequence;
  for (size_t offset = 0; offset < N; ++offset) {
    sequence[offset] = start + offset;
  }
  return sequence;
}

/**
 * Owning, grow-only host buffer backed by malloc/free.
 *
 * Fixes over the previous version:
 *  - the destructor now actually releases the memory (its condition was
 *    inverted, so it only ever called free() on a null pointer);
 *  - alloc_or_reuse() stores the new allocation (the result of malloc was
 *    discarded, so the returned pointer referred to freed memory);
 *  - copying is disabled, because two copies would free the same memory.
 *
 * @tparam T  Element type. Storage is raw: no constructors are run.
 */
template <class T>
class HostBuffer {
 public:
  /** Allocates room for `size` elements (uninitialised). */
  HostBuffer(const size_t size = 1)
      : ptr_(reinterpret_cast<T*>(malloc(sizeof(T) * size))), size_(size) {}

  // free(nullptr) is a no-op, so no guard is needed; free() does not throw.
  ~HostBuffer() { free(ptr_); }

  HostBuffer(const HostBuffer&) = delete;
  HostBuffer& operator=(const HostBuffer&) = delete;

  /**
   * Ensures room for at least `size` elements and returns the buffer.
   *
   * When the buffer has to grow, the old content is discarded (not copied).
   * Returns nullptr if the new allocation fails.
   */
  __inline__ T* alloc_or_reuse(const size_t size = 0) {
    if (size > size_) {
      free(ptr_);
      ptr_ = reinterpret_cast<T*>(malloc(sizeof(T) * size));
      // Record no capacity after a failed allocation so that a later call
      // tries again instead of handing out a null buffer as "large enough".
      size_ = (ptr_ != nullptr) ? size : 0;
    }
    return ptr_;
  }

  /** Returns the current buffer without changing it. */
  __inline__ T* ptr() { return ptr_; }

 private:
  T* ptr_;
  size_t size_;
};

/**
 * Gathers vectors referenced through a pointer table into a dense array.
 *
 * Flat element index t in [0, N) maps to vector `t / dim`, component
 * `t % dim`. Elements whose source pointer is null are left untouched in
 * `dst`. Threads cover [0, N) with a grid-stride loop, so any launch
 * configuration is valid.
 *
 * Indices and the stride are computed in size_t; the previous `int`
 * temporaries truncated for very large inputs.
 *
 * @param src  Table of per-vector source pointers (entries may be null).
 * @param dst  Dense destination of N elements.
 * @param dim  Number of components per vector.
 * @param N    Total number of elements (vector count * dim).
 */
template <class V>
__global__ void read_from_ptr_kernel(const V* const* __restrict src,
                                     V* __restrict dst, const size_t dim,
                                     size_t N) {
  const size_t tid =
      (static_cast<size_t>(blockIdx.x) * blockDim.x) + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = tid; t < N; t += stride) {
    const size_t vec_index = t / dim;
    const size_t dim_index = t % dim;
    if (src[vec_index]) {
      dst[vec_index * dim + dim_index] = src[vec_index][dim_index];
    }
  }
}

/**
 * Launches read_from_ptr_kernel() on `stream` to gather `n` vectors of `dim`
 * components from the pointer table `src` into the dense array `dst`.
 *
 * The launch is asynchronous; the caller is responsible for synchronizing
 * the stream before reading `dst`.
 *
 * @param src     Device table of `n` source pointers (entries may be null).
 * @param dst     Dense device destination of `n * dim` elements.
 * @param dim     Number of components per vector.
 * @param n       Number of vectors.
 * @param stream  Stream the kernel is launched on.
 */
template <class V>
void read_from_ptr(const V* const* __restrict src, V* __restrict dst,
                   const size_t dim, size_t n, cudaStream_t stream) {
  const size_t block_size = 1024;
  const size_t N = n * dim;
  // Nothing to copy; also avoids attempting a launch with an empty grid.
  if (N == 0) {
    return;
  }
  const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size);

  read_from_ptr_kernel<V>
      <<<grid_size, block_size, 0, stream>>>(src, dst, dim, N);
}

/**
 * Builds a pointer table over a dense array: `ptr[i] = array + i * dim` for
 * every i in [0, N). Threads cover [0, N) with a grid-stride loop.
 *
 * Indices and the stride are computed in size_t; the previous `int`
 * temporary truncated for very large inputs.
 *
 * @param ptr    Output table of N pointers.
 * @param array  Dense array holding N vectors of `dim` components.
 * @param dim    Number of components per vector.
 * @param N      Number of vectors.
 */
template <class V>
__global__ void array2ptr_kernel(V** ptr, V* __restrict array, const size_t dim,
                                 size_t N) {
  const size_t tid =
      (static_cast<size_t>(blockIdx.x) * blockDim.x) + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = tid; t < N; t += stride) {
    ptr[t] = array + t * dim;
  }
}

/**
 * Launches array2ptr_kernel() on `stream` so that `ptr[i]` points at the
 * i-th `dim`-wide vector inside `array`, for i in [0, n).
 *
 * The launch is asynchronous; the caller is responsible for synchronizing
 * the stream before using `ptr`.
 *
 * @param ptr     Device table receiving `n` pointers.
 * @param array   Dense device array of `n * dim` elements.
 * @param dim     Number of components per vector.
 * @param n       Number of vectors.
 * @param stream  Stream the kernel is launched on.
 */
template <class V>
void array2ptr(V** ptr, V* __restrict array, const size_t dim, size_t n,
               cudaStream_t stream) {
  const size_t block_size = 1024;
  const size_t N = n;
  // Nothing to build; also avoids attempting a launch with an empty grid.
  if (N == 0) {
    return;
  }
  const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size);

  array2ptr_kernel<V><<<grid_size, block_size, 0, stream>>>(ptr, array, dim, N);
}

/**
 * Per-vector gather or scatter between a pointer table and a dense array.
 *
 * Flat element index t in [0, N) maps to vector `t / dim`, component
 * `t % dim`. For each vector:
 *  - a null entry in `src` is skipped;
 *  - `read_or_write[v] == true` copies from `src[v]` into `dst`;
 *  - `read_or_write[v] == false` copies from `dst` into `src[v]`.
 * Threads cover [0, N) with a grid-stride loop.
 *
 * Indices and the stride are computed in size_t; the previous `int`
 * temporaries truncated for very large inputs.
 *
 * @param src            Table of per-vector pointers (entries may be null).
 * @param dst            Dense array of N elements.
 * @param read_or_write  Per-vector direction flag.
 * @param dim            Number of components per vector.
 * @param N              Total number of elements (vector count * dim).
 */
template <class V>
__global__ void read_or_write_ptr_kernel(V** __restrict src, V* __restrict dst,
                                         bool* read_or_write, const size_t dim,
                                         size_t N) {
  const size_t tid =
      (static_cast<size_t>(blockIdx.x) * blockDim.x) + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t t = tid; t < N; t += stride) {
    const size_t vec_index = t / dim;
    const size_t dim_index = t % dim;
    if (!src[vec_index]) continue;
    if (read_or_write[vec_index]) {
      dst[vec_index * dim + dim_index] = src[vec_index][dim_index];
    } else {
      src[vec_index][dim_index] = dst[vec_index * dim + dim_index];
    }
  }
}

/**
 * Launches read_or_write_ptr_kernel() on `stream` for `n` vectors of `dim`
 * components.
 *
 * The launch is asynchronous; the caller is responsible for synchronizing
 * the stream before using the results.
 *
 * @param src            Device table of `n` per-vector pointers.
 * @param dst            Dense device array of `n * dim` elements.
 * @param read_or_write  Device array of `n` direction flags (true: read from
 *                       `src` into `dst`; false: write `dst` into `src`).
 * @param dim            Number of components per vector.
 * @param n              Number of vectors.
 * @param stream         Stream the kernel is launched on.
 */
template <class V>
void read_or_write_ptr(V** __restrict src, V* __restrict dst,
                       bool* read_or_write, const size_t dim, size_t n,
                       cudaStream_t stream) {
  const size_t block_size = 1024;
  const size_t N = n * dim;
  // Nothing to move; also avoids attempting a launch with an empty grid.
  if (N == 0) {
    return;
  }
  const size_t grid_size = nv::merlin::SAFE_GET_GRID_SIZE(N, block_size);

  read_or_write_ptr_kernel<V>
      <<<grid_size, block_size, 0, stream>>>(src, dst, read_or_write, dim, N);
}

}  // namespace test_util


================================================
FILE: tests/uint32_score_test.cc.cu
================================================
#include <gtest/gtest.h>
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <memory>
#include <unordered_map>
#include "merlin_hashtable.cuh"
#include "test_util.cuh"

// Sizes shared by the tests in this file: values per key, table capacity and
// the number of keys inserted.
constexpr size_t DIM = 8;
constexpr uint64_t CAPACITY = 1024;
constexpr uint64_t KEY_NUM = 256;

// Table instantiation under test: 64-bit keys, float values and 32-bit
// scores with the kCustomized eviction strategy.
using K = uint64_t;
using V = float;
using S = uint32_t;
using TableOptions = nv::merlin::HashTableOptions;
using EvictStrategy = nv::merlin::EvictStrategy;
using Table = nv::merlin::HashTable<K, V, S, EvictStrategy::kCustomized>;

namespace {

// Builds the table options shared by the tests in this file: a fixed-size
// table (initial capacity == maximum capacity == CAPACITY) of DIM-wide
// vectors, buckets of 128 slots and a 1 GB HBM budget for vectors.
TableOptions default_options() {
  TableOptions opts;

  // Geometry of the stored data.
  opts.dim = DIM;
  opts.max_bucket_size = 128;

  // Equal initial and maximum capacity.
  opts.init_capacity = CAPACITY;
  opts.max_capacity = CAPACITY;

  opts.max_hbm_for_vectors = nv::merlin::GB(1);
  return opts;
}

void fill_sequential(test_util::KVMSBuffer<K, V, S>& buffer) {
  for (size_t i = 0; i < buffer.len(); ++i) {
    K key = static_cast<K>(i + 1);
    buffer.keys.h_data[i] = key;
    buffer.scores.h_data[i] = static_cast<S>(key);
    for (size_t j = 0; j < buffer.dim(); ++j) {
      buffer.values.h_data[i * buffer.dim() + j] =
          static_cast<V>(key * 0.00001f);
    }
  }
}

}  // namespace

// Round trip with 32-bit scores: insert KEY_NUM sequential keys through
// find_or_insert, read them back with find and compare status, score and
// values, then check that keys which were never inserted are reported as not
// found.
TEST(Uint32ScoreTest, FindOrInsertAndFind) {
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  auto table = std::make_unique<Table>();
  table->init(default_options());

  // Seed data: keys 1..KEY_NUM as generated by fill_sequential().
  // SyncData(true, ...) is presumably the host-to-device direction, since it
  // is called after the host arrays were filled and before the table call.
  test_util::KVMSBuffer<K, V, S> seed;
  seed.Reserve(KEY_NUM, DIM, stream);
  fill_sequential(seed);
  seed.SyncData(true, stream);

  table->find_or_insert(KEY_NUM, seed.keys_ptr(), seed.values_ptr(),
                        seed.scores_ptr(), stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // All seeded keys are expected to be present.
  const size_t stored = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(stored, KEY_NUM);

  // Look the same keys up again into a zeroed result buffer.
  test_util::KVMSBuffer<K, V, S> found;
  found.Reserve(KEY_NUM, DIM, stream);
  found.ToZeros(stream);

  table->find(KEY_NUM, seed.keys_ptr(), found.values_ptr(),
              found.status_ptr(), found.scores_ptr(), stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  found.SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Each key must be reported as found, with the score and values it was
  // inserted with.
  for (size_t row = 0; row < KEY_NUM; ++row) {
    ASSERT_TRUE(found.status.h_data[row]);
    ASSERT_EQ(found.scores.h_data[row], seed.scores.h_data[row]);
    const size_t base = row * DIM;
    for (size_t col = 0; col < DIM; ++col) {
      ASSERT_EQ(found.values.h_data[base + col],
                seed.values.h_data[base + col]);
    }
  }

  // Keys starting at KEY_NUM + 1000 were never inserted.
  constexpr size_t MISSING_NUM = 16;
  test_util::KVMSBuffer<K, V, S> absent;
  absent.Reserve(MISSING_NUM, DIM, stream);
  absent.ToZeros(stream);
  for (size_t row = 0; row < MISSING_NUM; ++row) {
    absent.keys.h_data[row] = static_cast<K>(KEY_NUM + 1000 + row);
  }
  absent.SyncData(true, stream);

  table->find(MISSING_NUM, absent.keys_ptr(), absent.values_ptr(),
              absent.status_ptr(), absent.scores_ptr(), stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  absent.SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  for (size_t row = 0; row < MISSING_NUM; ++row) {
    ASSERT_FALSE(absent.status.h_data[row]);
  }

  CUDA_CHECK(cudaStreamDestroy(stream));
  CudaCheckError();
}

// Overwrites the scores of already-inserted keys with assign_scores, including
// the boundary values of the 32-bit score type (0, 1, max - 1, max), and
// checks that both find and export_batch report the new scores together with
// the untouched values.
TEST(Uint32ScoreTest, AssignScoresAndExport) {
  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  auto table = std::make_unique<Table>();
  table->init(default_options());

  // Seed data: keys 1..KEY_NUM as generated by fill_sequential().
  test_util::KVMSBuffer<K, V, S> seed;
  seed.Reserve(KEY_NUM, DIM, stream);
  fill_sequential(seed);
  seed.SyncData(true, stream);

  table->find_or_insert(KEY_NUM, seed.keys_ptr(), seed.values_ptr(),
                        seed.scores_ptr(), stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // Replacement scores: 1000 + index for every key, then the first four
  // entries are overridden with the extremes of S.
  for (size_t row = 0; row < KEY_NUM; ++row) {
    seed.scores.h_data[row] = static_cast<S>(1000 + row);
  }
  const S boundary_scores[] = {
      static_cast<S>(0),
      std::numeric_limits<S>::max(),
      static_cast<S>(1),
      static_cast<S>(std::numeric_limits<S>::max() - 1),
  };
  for (size_t row = 0; row < 4; ++row) {
    seed.scores.h_data[row] = boundary_scores[row];
  }
  seed.scores.SyncData(true, stream);

  table->assign_scores(KEY_NUM, seed.keys_ptr(), seed.scores_ptr(), stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // First check: find must report every key with its replacement score.
  test_util::KVMSBuffer<K, V, S> looked_up;
  looked_up.Reserve(KEY_NUM, DIM, stream);
  looked_up.ToZeros(stream);

  table->find(KEY_NUM, seed.keys_ptr(), looked_up.values_ptr(),
              looked_up.status_ptr(), looked_up.scores_ptr(), stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  looked_up.SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  for (size_t row = 0; row < KEY_NUM; ++row) {
    ASSERT_TRUE(looked_up.status.h_data[row]);
    ASSERT_EQ(looked_up.scores.h_data[row], seed.scores.h_data[row]);
  }

  // Second check: export the whole table and match each exported entry
  // against the expectation, independent of export order.
  const size_t capacity = table->capacity();
  test_util::KVMSBuffer<K, V, S> exported;
  exported.Reserve(capacity, DIM, stream);
  exported.ToZeros(stream);

  const size_t dumped =
      table->export_batch(capacity, 0, exported.keys_ptr(),
                          exported.values_ptr(), exported.scores_ptr(), stream);
  ASSERT_EQ(dumped, KEY_NUM);

  exported.SyncData(false, stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));

  // key -> expected score; entries are removed as they are matched so that a
  // duplicate or a missing key is detected.
  std::unordered_map<K, S> pending;
  pending.reserve(KEY_NUM);
  for (size_t row = 0; row < KEY_NUM; ++row) {
    pending.emplace(static_cast<K>(row + 1), seed.scores.h_data[row]);
  }

  for (size_t slot = 0; slot < dumped; ++slot) {
    const K key = exported.keys.h_data[slot];
    auto match = pending.find(key);
    ASSERT_NE(match, pending.end());
    ASSERT_EQ(exported.scores.h_data[slot], match->second);
    pending.erase(match);

    const size_t base = slot * DIM;
    for (size_t col = 0; col < DIM; ++col) {
      ASSERT_EQ(exported.values.h_data[base + col],
                static_cast<V>(key * 0.00001f));
    }
  }
  ASSERT_TRUE(pending.empty());

  CUDA_CHECK(cudaStreamDestroy(stream));
  CudaCheckError();
}

// Large-scale eviction check for a table with 32-bit scores and the
// customized eviction strategy.
//
// The table capacity is BATCH_SIZE * STEPS entries. Each of the `rounds`
// rounds inserts exactly that many consecutive keys (continuing where the
// previous round stopped) in STEPS batches, exports the whole table and then
// verifies:
//   - every exported score equals its key converted to S,
//   - every exported value element equals key * 0.00001,
//   - the largest exported key is at least the last key of the round,
//   - the share of exported entries whose score is at least the first key of
//     the round, relative to the capacity, reaches EXPECTED_CORRECT_RATE.
//
// Note: this test allocates MAX_CAPACITY-sized key/score/vector arrays on both
// host and device in addition to the table itself.
TEST(Uint32ScoreTest, EvictCustomizedCorrectRateFull) {
  constexpr uint64_t BATCH_SIZE = 1024 * 1024ul;
  constexpr uint64_t STEPS = 128;
  constexpr uint64_t MAX_BUCKET_SIZE = 128;
  constexpr uint64_t INIT_CAPACITY = BATCH_SIZE * STEPS;
  constexpr uint64_t MAX_CAPACITY = INIT_CAPACITY;
  constexpr float EXPECTED_CORRECT_RATE = 0.964f;
  const int rounds = 6;

  TableOptions options;
  options.init_capacity = INIT_CAPACITY;
  options.max_capacity = MAX_CAPACITY;
  options.dim = DIM;
  options.reserved_key_start_bit = 17;
  options.num_of_buckets_per_alloc = 128;
  options.max_bucket_size = MAX_BUCKET_SIZE;
  options.max_hbm_for_vectors = nv::merlin::GB(16);

  // Keep the host buffers alive as named objects for the whole test. Taking
  // ptr() from a temporary HostBuffer would leave the raw pointer without an
  // owner as soon as the declaration ends.
  test_util::HostBuffer<K> h_keys_base_buf(BATCH_SIZE);
  test_util::HostBuffer<S> h_scores_base_buf(BATCH_SIZE);
  test_util::HostBuffer<V> h_vectors_base_buf(BATCH_SIZE * options.dim);
  K* h_keys_base = h_keys_base_buf.ptr();
  S* h_scores_base = h_scores_base_buf.ptr();
  V* h_vectors_base = h_vectors_base_buf.ptr();

  test_util::HostBuffer<K> h_keys_temp_buf(MAX_CAPACITY);
  test_util::HostBuffer<S> h_scores_temp_buf(MAX_CAPACITY);
  test_util::HostBuffer<V> h_vectors_temp_buf(MAX_CAPACITY * options.dim);
  K* h_keys_temp = h_keys_temp_buf.ptr();
  S* h_scores_temp = h_scores_temp_buf.ptr();
  V* h_vectors_temp = h_vectors_temp_buf.ptr();

  // Device staging buffers. They are sized for a full export (MAX_CAPACITY)
  // and reused as the per-batch insert input (BATCH_SIZE).
  K* d_keys_temp = nullptr;
  S* d_scores_temp = nullptr;
  V* d_vectors_temp = nullptr;

  CUDA_CHECK(cudaMalloc(&d_keys_temp, MAX_CAPACITY * sizeof(K)));
  CUDA_CHECK(cudaMalloc(&d_scores_temp, MAX_CAPACITY * sizeof(S)));
  CUDA_CHECK(
      cudaMalloc(&d_vectors_temp, MAX_CAPACITY * sizeof(V) * options.dim));

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  std::unique_ptr<Table> table = std::make_unique<Table>();
  table->init(options);

  // A freshly initialized table must be empty.
  size_t total_size = table->size(stream);
  CUDA_CHECK(cudaStreamSynchronize(stream));
  ASSERT_EQ(total_size, 0u);

  size_t global_start_key = 100000;
  size_t start_key = global_start_key;

  for (int r = 0; r < rounds; ++r) {
    // Key range inserted during this round.
    const K expected_min_key =
        static_cast<K>(global_start_key + INIT_CAPACITY * r);
    const K expected_max_key =
        static_cast<K>(global_start_key + INIT_CAPACITY * (r + 1) - 1);
    // The first round starts from an empty table, so only the expected rate
    // is required; afterwards the table is expected to stay full.
    const size_t expected_table_size =
        (r == 0) ? static_cast<size_t>(EXPECTED_CORRECT_RATE * INIT_CAPACITY)
                 : INIT_CAPACITY;

    for (uint64_t s = 0; s < STEPS; ++s) {
      test_util::create_continuous_keys<K, S, V, DIM>(
          h_keys_base, h_scores_base, h_vectors_base, BATCH_SIZE, start_key);
      start_key += BATCH_SIZE;

      CUDA_CHECK(cudaMemcpy(d_keys_temp, h_keys_base, BATCH_SIZE * sizeof(K),
                            cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_scores_temp, h_scores_base,
                            BATCH_SIZE * sizeof(S), cudaMemcpyHostToDevice));
      CUDA_CHECK(cudaMemcpy(d_vectors_temp, h_vectors_base,
                            BATCH_SIZE * sizeof(V) * options.dim,
                            cudaMemcpyHostToDevice));
      table->insert_or_assign(BATCH_SIZE, d_keys_temp, d_vectors_temp,
                              d_scores_temp, stream);
      // The staging buffers are overwritten by the next batch, so wait for
      // the insert to finish first.
      CUDA_CHECK(cudaStreamSynchronize(stream));
    }

    total_size = table->size(stream);
    CUDA_CHECK(cudaStreamSynchronize(stream));
    ASSERT_GE(total_size, expected_table_size);
    ASSERT_EQ(MAX_CAPACITY, table->capacity());

    // Export the whole table into the device staging buffers and bring the
    // result to the host for verification.
    size_t dump_counter = table->export_batch(
        MAX_CAPACITY, 0, d_keys_temp, d_vectors_temp, d_scores_temp, stream);

    CUDA_CHECK(cudaMemcpy(h_keys_temp, d_keys_temp, MAX_CAPACITY * sizeof(K),
                          cudaMemcpyDefault));
    CUDA_CHECK(cudaMemcpy(h_scores_temp, d_scores_temp,
                          MAX_CAPACITY * sizeof(S), cudaMemcpyDefault));
    CUDA_CHECK(cudaMemcpy(h_vectors_temp, d_vectors_temp,
                          MAX_CAPACITY * sizeof(V) * options.dim,
                          cudaMemcpyDefault));

    ASSERT_EQ(total_size, dump_counter);
    size_t bigger_score_counter = 0;
    K max_key = 0;

    // Only the first dump_counter exported entries are meaningful.
    for (size_t i = 0; i < dump_counter; ++i) {
      ASSERT_EQ(h_scores_temp[i], static_cast<S>(h_keys_temp[i]));
      max_key = std::max(max_key, h_keys_temp[i]);
      // Count entries that belong to the current round.
      if (h_scores_temp[i] >= static_cast<S>(expected_min_key)) {
        bigger_score_counter++;
      }
      for (size_t j = 0; j < options.dim; ++j) {
        const V expected = static_cast<V>(h_keys_temp[i] * 0.00001);
        ASSERT_EQ(h_vectors_temp[i * options.dim + j], expected);
      }
    }

    float correct_rate =
        (bigger_score_counter * 1.0f) / static_cast<float>(MAX_CAPACITY);
    std::cout << "[Round " << r << "] "
              << "correct_rate=" << correct_rate << std::endl;
    ASSERT_GE(max_key, expected_max_key);
    ASSERT_GE(correct_rate, EXPECTED_CORRECT_RATE);
  }

  CUDA_CHECK(cudaStreamDestroy(stream));

  CUDA_CHECK(cudaFree(d_keys_temp));
  CUDA_CHECK(cudaFree(d_scores_temp));
  CUDA_CHECK(cudaFree(d_vectors_temp));

  CUDA_CHECK(cudaDeviceSynchronize());

  CudaCheckError();
}