Full Code of efficient/SuRF for AI

master f2ec6e1b53e9 cached
60 files
2.7 MB
702.7k tokens
376 symbols
1 requests
Download .txt
Showing preview only (2,811K chars total). Download the full file or copy to clipboard to get everything.
Repository: efficient/SuRF
Branch: master
Commit: f2ec6e1b53e9
Files: 60
Total size: 2.7 MB

Directory structure:
gitextract_dyvduj7s/

├── .gitignore
├── .gitmodules
├── .travis.yml
├── CMakeLists.txt
├── CodeCoverage.cmake
├── LICENSE
├── README.md
├── bench/
│   ├── CMakeLists.txt
│   ├── MurmurHash3.h
│   ├── bench.hpp
│   ├── bloom.hpp
│   ├── filter.hpp
│   ├── filter_bloom.hpp
│   ├── filter_factory.hpp
│   ├── filter_surf.hpp
│   ├── run.sh
│   ├── workload.cpp
│   ├── workload_arf.cpp
│   ├── workload_gen/
│   │   ├── gen_load.py
│   │   ├── gen_txn.py
│   │   ├── gen_workload.sh
│   │   ├── workload_spec/
│   │   │   ├── workload_template
│   │   │   ├── workloadc_email_latest
│   │   │   ├── workloadc_email_uniform
│   │   │   ├── workloadc_email_zipfian
│   │   │   ├── workloadc_randint_latest
│   │   │   ├── workloadc_randint_uniform
│   │   │   └── workloadc_randint_zipfian
│   │   └── ycsb_download.sh
│   └── workload_multi_thread.cpp
├── include/
│   ├── bitvector.hpp
│   ├── config.hpp
│   ├── hash.hpp
│   ├── label_vector.hpp
│   ├── louds_dense.hpp
│   ├── louds_sparse.hpp
│   ├── popcount.h
│   ├── rank.hpp
│   ├── select.hpp
│   ├── suffix.hpp
│   ├── surf.hpp
│   └── surf_builder.hpp
├── simple_example.cpp
├── src/
│   └── CMakeLists.txt
└── test/
    ├── CMakeLists.txt
    ├── unitTest/
    │   ├── CMakeLists.txt
    │   ├── test_bitvector.cpp
    │   ├── test_label_vector.cpp
    │   ├── test_louds_dense.cpp
    │   ├── test_louds_dense_small.cpp
    │   ├── test_louds_sparse.cpp
    │   ├── test_louds_sparse_small.cpp
    │   ├── test_rank.cpp
    │   ├── test_select.cpp
    │   ├── test_suffix.cpp
    │   ├── test_suffix_vector.cpp
    │   ├── test_surf.cpp
    │   ├── test_surf_builder.cpp
    │   └── test_surf_small.cpp
    └── words.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app


================================================
FILE: .gitmodules
================================================
[submodule "ARF"]
	path = ARF
	url = https://github.com/efficient/ARF.git
	branch = master


================================================
FILE: .travis.yml
================================================
# Travis CI configuration: build with gcc on Xenial, run the unit tests,
# and upload lcov coverage data to Coveralls.
language: cpp
sudo: required
dist: xenial
compiler: gcc

install:
# The Ubuntu googletest package is named libgtest-dev (the previous
# "libgtest.dev" only worked because apt treats '.' as a regex character).
# The package ships sources only, so it is built and installed manually.
- sudo apt-get install build-essential
- sudo apt-get install cmake
- sudo apt-get install libgtest-dev
- cd /usr/src/gtest
- sudo cmake CMakeLists.txt
- sudo make
- sudo cp *.a /usr/lib
- sudo apt-get install lcov
- sudo apt-get install ruby
- sudo gem install coveralls-lcov

script:
- cd $TRAVIS_BUILD_DIR
- mkdir build
- cd build
- cmake -DCMAKE_BUILD_TYPE=Debug -DCOVERALLS=ON ..
- make -j
- make coverage

after_success:
# Strip test and system sources from the report before uploading.
- lcov --remove coverage.info 'test/*' '/usr/*' '/lib/*' --output-file coverage.info
- lcov --list coverage.info
- coveralls-lcov --repo-token=${COVERALLS_TOKEN} coverage.info


================================================
FILE: CMakeLists.txt
================================================
# Top-level build script for SuRF (Succinct Range Filter).
# CMake < 3.5 compatibility is rejected outright by CMake >= 3.27, so 2.6
# no longer configures at all; 3.5 covers every feature used here.
cmake_minimum_required(VERSION 3.5)
project(SuRF)

message(STATUS "Configuring ${CMAKE_PROJECT_NAME}...")

# Default to an optimised build when the user does not choose one.
if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
endif()

# -mpopcnt: the rank/select structures rely on the POPCNT instruction.
# NOTE(review): per-target target_compile_options()/target_compile_features()
# would be preferable to mutating global flag strings, but the flags are kept
# as-is so the test/ and bench/ subdirectories see identical options.
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -Wall -mpopcnt -pthread -std=c++11")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -Wall -Werror -mpopcnt -pthread -std=c++11")

option(COVERALLS "Generate coveralls data" OFF)

if (COVERALLS)
  # Coverage builds: instrument the compiler, exclude third-party/test code
  # from the report, and add a `make coverage` target that runs the tests.
  include("${CMAKE_CURRENT_SOURCE_DIR}/CodeCoverage.cmake")
  append_coverage_compiler_flags()
  set(COVERAGE_EXCLUDES 'ARF/*' 'bench/*' 'test/*' '/usr/*' '/lib/*')
  setup_target_for_coverage(
    NAME coverage
    EXECUTABLE make test
    )
else()
  # Strip assert() from non-coverage builds.
  add_definitions(-DNDEBUG)
endif()

enable_testing()

# SuRF is header-only; every target just needs the include directory.
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include")

add_subdirectory(test)
add_subdirectory(bench)

# Enable together with the ARF submodule to build the ARF comparison bench.
#include_directories("${CMAKE_CURRENT_SOURCE_DIR}/ARF/include")
#add_subdirectory(ARF)


================================================
FILE: CodeCoverage.cmake
================================================
# Copyright (c) 2012 - 2017, Lars Bilke
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its contributors
#    may be used to endorse or promote products derived from this software without
#    specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# USAGE:
#
# 1. Copy this file into your cmake modules path.
#
# 2. Add the following line to your CMakeLists.txt:
#      include(CodeCoverage)
#
# 3. Append necessary compiler flags:
#      APPEND_COVERAGE_COMPILER_FLAGS()
#
# 4. If you need to exclude additional directories from the report, specify them
#    using the COVERAGE_EXCLUDES variable before calling SETUP_TARGET_FOR_COVERAGE.
#    Example:
#      set(COVERAGE_EXCLUDES 'dir1/*' 'dir2/*')
#
# 5. Use the functions described below to create a custom make target which
#    runs your test executable and produces a code coverage report.
#
# 6. Build a Debug build:
#      cmake -DCMAKE_BUILD_TYPE=Debug ..
#      make
#      make my_coverage_target
#

include(CMakeParseArguments)

# Check prereqs: locate the coverage tool chain on PATH. gcov is required
# unconditionally; lcov/genhtml/gcovr/python are only validated later by
# the setup function that actually uses them.
find_program( GCOV_PATH gcov )
find_program( LCOV_PATH  NAMES lcov lcov.bat lcov.exe lcov.perl)
find_program( GENHTML_PATH NAMES genhtml genhtml.perl genhtml.bat )
find_program( GCOVR_PATH gcovr PATHS ${CMAKE_SOURCE_DIR}/scripts/test)
find_program( SIMPLE_PYTHON_EXECUTABLE python )

if(NOT GCOV_PATH)
    message(FATAL_ERROR "gcov not found! Aborting...")
endif() # NOT GCOV_PATH

# gcov-style coverage is only supported with GNU gcc or Clang >= 3.0.
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?[Cc]lang")
    if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 3)
        message(FATAL_ERROR "Clang version must be 3.0.0 or greater! Aborting...")
    endif()
elseif(NOT CMAKE_COMPILER_IS_GNUCXX)
    message(FATAL_ERROR "Compiler is not GNU gcc! Aborting...")
endif()

# Instrumentation flags: -O0 keeps line attribution exact; --coverage
# implies -fprofile-arcs -ftest-coverage (kept explicit for old compilers).
set(COVERAGE_COMPILER_FLAGS "-g -O0 --coverage -fprofile-arcs -ftest-coverage"
    CACHE INTERNAL "")

# Flag sets for a dedicated "Coverage" build type, selectable via
# -DCMAKE_BUILD_TYPE=Coverage.
set(CMAKE_CXX_FLAGS_COVERAGE
    ${COVERAGE_COMPILER_FLAGS}
    CACHE STRING "Flags used by the C++ compiler during coverage builds."
    FORCE )
set(CMAKE_C_FLAGS_COVERAGE
    ${COVERAGE_COMPILER_FLAGS}
    CACHE STRING "Flags used by the C compiler during coverage builds."
    FORCE )
set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
    ""
    CACHE STRING "Flags used for linking binaries during coverage builds."
    FORCE )
set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
    ""
    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
    FORCE )
mark_as_advanced(
    CMAKE_CXX_FLAGS_COVERAGE
    CMAKE_C_FLAGS_COVERAGE
    CMAKE_EXE_LINKER_FLAGS_COVERAGE
    CMAKE_SHARED_LINKER_FLAGS_COVERAGE )

if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
    message(WARNING "Code coverage results with an optimised (non-Debug) build may be misleading")
endif() # NOT CMAKE_BUILD_TYPE STREQUAL "Debug"

# GCC needs an explicit link against libgcov; other compilers accept the
# --coverage driver flag at link time instead.
if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
    link_libraries(gcov)
else()
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --coverage")
endif()

# Defines a target for running and collection code coverage information
# Builds dependencies, runs the given executable and outputs reports.
# NOTE! The executable should always have a ZERO as exit code otherwise
# the coverage generation will not complete.
#
# SETUP_TARGET_FOR_COVERAGE(
#     NAME testrunner_coverage                    # New target name
#     EXECUTABLE testrunner -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR
#     DEPENDENCIES testrunner                     # Dependencies to build first
# )
function(SETUP_TARGET_FOR_COVERAGE)

    # Keyword arguments (see usage comment above):
    #   NAME            - name of the custom target to create
    #   EXECUTABLE      - command that runs the test suite
    #   EXECUTABLE_ARGS - extra arguments for EXECUTABLE
    #   DEPENDENCIES    - targets to build before running
    set(options NONE)
    set(oneValueArgs NAME)
    set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES)
    cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    if(NOT LCOV_PATH)
        message(FATAL_ERROR "lcov not found! Aborting...")
    endif() # NOT LCOV_PATH

    if(NOT GENHTML_PATH)
        message(FATAL_ERROR "genhtml not found! Aborting...")
    endif() # NOT GENHTML_PATH

    # Setup target; the COMMANDs run in order and stop at the first failure.
    add_custom_target(${Coverage_NAME}

        # Cleanup lcov: reset all execution counters from previous runs
        COMMAND ${LCOV_PATH} --directory . --zerocounters
        # Create baseline to make sure untouched files show up in the report
        COMMAND ${LCOV_PATH} -c -i -d . -o ${Coverage_NAME}.base

        # Run tests (must exit with ZERO or the steps below never run)
        COMMAND ${Coverage_EXECUTABLE}

        # Capturing lcov counters and generating report
        COMMAND ${LCOV_PATH} --directory . --capture --output-file ${Coverage_NAME}.info
        # add baseline counters so files with no executed lines still appear
        COMMAND ${LCOV_PATH} -a ${Coverage_NAME}.base -a ${Coverage_NAME}.info --output-file ${Coverage_NAME}.total
        # strip COVERAGE_EXCLUDES patterns, render HTML, then drop temp files
        COMMAND ${LCOV_PATH} --remove ${Coverage_NAME}.total ${COVERAGE_EXCLUDES} --output-file ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned
        COMMAND ${GENHTML_PATH} -o ${Coverage_NAME} ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned
        COMMAND ${CMAKE_COMMAND} -E remove ${Coverage_NAME}.base ${Coverage_NAME}.total ${PROJECT_BINARY_DIR}/${Coverage_NAME}.info.cleaned

        WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
        DEPENDS ${Coverage_DEPENDENCIES}
        COMMENT "Resetting code coverage counters to zero.\nProcessing code coverage counters and generating report."
    )

    # Show where to find the lcov info report ("COMMAND ;" is a no-op whose
    # only purpose is to carry the COMMENT)
    add_custom_command(TARGET ${Coverage_NAME} POST_BUILD
        COMMAND ;
        COMMENT "Lcov code coverage info report saved in ${Coverage_NAME}.info."
    )

    # Show info where to find the report
    add_custom_command(TARGET ${Coverage_NAME} POST_BUILD
        COMMAND ;
        COMMENT "Open ./${Coverage_NAME}/index.html in your browser to view the coverage report."
    )

endfunction() # SETUP_TARGET_FOR_COVERAGE

# Defines a target for running and collection code coverage information
# Builds dependencies, runs the given executable and outputs reports.
# NOTE! The executable should always have a ZERO as exit code otherwise
# the coverage generation will not complete.
#
# SETUP_TARGET_FOR_COVERAGE_COBERTURA(
#     NAME ctest_coverage                    # New target name
#     EXECUTABLE ctest -j ${PROCESSOR_COUNT} # Executable in PROJECT_BINARY_DIR
#     DEPENDENCIES executable_target         # Dependencies to build first
# )
function(SETUP_TARGET_FOR_COVERAGE_COBERTURA)

    # Keyword arguments (see usage comment above):
    #   NAME            - name of the custom target to create
    #   EXECUTABLE      - command that runs the test suite
    #   EXECUTABLE_ARGS - extra arguments for EXECUTABLE
    #   DEPENDENCIES    - targets to build before running
    set(options NONE)
    set(oneValueArgs NAME)
    set(multiValueArgs EXECUTABLE EXECUTABLE_ARGS DEPENDENCIES)
    cmake_parse_arguments(Coverage "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    if(NOT SIMPLE_PYTHON_EXECUTABLE)
        message(FATAL_ERROR "python not found! Aborting...")
    endif() # NOT SIMPLE_PYTHON_EXECUTABLE

    if(NOT GCOVR_PATH)
        message(FATAL_ERROR "gcovr not found! Aborting...")
    endif() # NOT GCOVR_PATH

    # Build the excludes as a CMake LIST of "-e <pattern>" pairs. The former
    # space-joined string expanded to a SINGLE command-line argument
    # ("-e a -e b"), which gcovr cannot parse; an unquoted list expansion
    # yields one argument per element as intended.
    set(COBERTURA_EXCLUDES "")
    foreach(EXCLUDE ${COVERAGE_EXCLUDES})
        list(APPEND COBERTURA_EXCLUDES -e "${EXCLUDE}")
    endforeach()

    add_custom_target(${Coverage_NAME}

        # Run tests (must exit with ZERO for the report step to run)
        ${Coverage_EXECUTABLE}

        # Running gcovr: -x emits Cobertura XML rooted at the source dir
        COMMAND ${GCOVR_PATH} -x -r ${CMAKE_SOURCE_DIR} ${COBERTURA_EXCLUDES}
            -o ${Coverage_NAME}.xml
        WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
        DEPENDS ${Coverage_DEPENDENCIES}
        COMMENT "Running gcovr to produce Cobertura code coverage report."
    )

    # Show info where to find the report ("COMMAND ;" is a no-op carrier
    # for the COMMENT)
    add_custom_command(TARGET ${Coverage_NAME} POST_BUILD
        COMMAND ;
        COMMENT "Cobertura code coverage report saved in ${Coverage_NAME}.xml."
    )

endfunction() # SETUP_TARGET_FOR_COVERAGE_COBERTURA

function(APPEND_COVERAGE_COMPILER_FLAGS)
    # Append the coverage instrumentation flags to the caller's C and C++
    # flag strings. Being a function() it has its own scope, so PARENT_SCOPE
    # is required for the new values to reach the including directory.
    foreach(lang C CXX)
        set(CMAKE_${lang}_FLAGS
            "${CMAKE_${lang}_FLAGS} ${COVERAGE_COMPILER_FLAGS}"
            PARENT_SCOPE)
    endforeach()
    message(STATUS "Appending code coverage compiler flags: ${COVERAGE_COMPILER_FLAGS}")
endfunction() # APPEND_COVERAGE_COMPILER_FLAGS

================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS


================================================
FILE: README.md
================================================
# Succinct Range Filter (SuRF)
[![Build Status](https://travis-ci.org/efficient/SuRF.svg?branch=master)](https://travis-ci.org/efficient/SuRF)
[![Coverage Status](https://coveralls.io/repos/github/efficient/SuRF/badge.svg?branch=master)](https://coveralls.io/github/efficient/SuRF?branch=master)

**SuRF** is a fast and compact filter that provides exact-match filtering,
range filtering, and approximate range counts. This is the source code for our
[SIGMOD best paper](http://www.cs.cmu.edu/~huanche1/publications/surf_paper.pdf).
We also host a [demo website](https://www.rangefilter.io/).
The RocksDB experiments with SuRF can be found [here](https://github.com/efficient/rocksdb).

## Install Dependencies
    sudo apt-get install build-essential cmake libgtest-dev
    cd /usr/src/gtest
    sudo cmake CMakeLists.txt
    sudo make
    sudo cp *.a /usr/lib

## Build
    git submodule init
    git submodule update
    mkdir build
    cd build
    cmake ..
    make -j

## Simple Example
A simple example can be found [here](https://github.com/efficient/SuRF/blob/master/simple_example.cpp). To run the example:
```
g++ -mpopcnt -std=c++11 simple_example.cpp
./a.out
```
Note that the key list passed to the SuRF constructor must be SORTED.

## Run Unit Tests
    make test

## Benchmark

### Step 1: Download YCSB
    cd bench/workload_gen
    bash ycsb_download.sh

### Step 2: Generate Workloads
    cd bench/workload_gen
    bash gen_workload.sh
You must provide your own email list to generate email-key workloads.

### Step 3: Run Workloads
    cd bench
    bash run.sh
Note that `run.sh` only includes several representative runs.
Refer to `bench/workload.cpp`, `bench/workload_multi_thread.cpp`
and `bench/workload_arf.cpp` for more experiment configurations.

## License
Copyright 2018, Carnegie Mellon University

Licensed under the [Apache License](https://github.com/efficient/SuRF/blob/master/LICENSE).


================================================
FILE: bench/CMakeLists.txt
================================================
# Benchmark drivers. SuRF is header-only, so the executables need no project
# libraries; -pthread arrives via the global CXX flags set at the top level.
# (The former empty `target_link_libraries(<tgt>)` calls were no-ops and
# have been removed.)
add_executable(workload workload.cpp)

add_executable(workload_multi_thread workload_multi_thread.cpp)

# ARF comparison benchmark: enable together with the ARF submodule lines in
# the top-level CMakeLists.txt.
#add_executable(workload_arf workload_arf.cpp)
#target_link_libraries(workload_arf ARF)


================================================
FILE: bench/MurmurHash3.h
================================================
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

#ifndef _MURMURHASH3_H_
#define _MURMURHASH3_H_

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

//typedef unsigned char uint8_t;
//typedef unsigned int uint32_t;
//typedef unsigned __int64 uint64_t;

// Other compilers

#include <stdint.h>
#include <stdlib.h>

// Other compilers

#define FORCE_INLINE inline __attribute__((always_inline))

// Rotate a 32-bit value left by r bits (r expected in [0, 31]).
// Masking the right-shift count with & 31 avoids the undefined behavior of
// `x >> 32` that the naive `x >> (32 - r)` form hits when r == 0; for every
// r in [1, 31] the result is bit-identical to the original.
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
    return (x << r) | (x >> ((32 - r) & 31));
}

// Rotate a 64-bit value left by r bits (r expected in [0, 63]).
// Masking the right-shift count with & 63 avoids the undefined behavior of
// `x >> 64` that the naive `x >> (64 - r)` form hits when r == 0; for every
// r in [1, 63] the result is bit-identical to the original.
inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
    return (x << r) | (x >> ((64 - r) & 63));
}

#define ROTL32(x,y)rotl32(x,y)
#define ROTL64(x,y)rotl64(x,y)

#define BIG_CONSTANT(x) (x##LLU)

//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

// Fetch 32-bit block i from p (i may be negative; callers index backward
// from the end of the block region). Isolated so platforms needing endian
// swapping or aligned-only reads can patch one spot, per the note above.
FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
{
    return *(p + i);
}

// Fetch 64-bit block i from p (i may be negative; callers index backward
// from the end of the block region). Isolated so platforms needing endian
// swapping or aligned-only reads can patch one spot, per the note above.
FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
{
    return *(p + i);
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

// 32-bit finalization mix: three xor-shift / multiply rounds that force all
// bits of the hash block to avalanche.
FORCE_INLINE uint32_t fmix32 ( uint32_t h )
{
    h = (h ^ (h >> 16)) * 0x85ebca6b;
    h = (h ^ (h >> 13)) * 0xc2b2ae35;
    return h ^ (h >> 16);
}

//----------

// 64-bit finalization mix: same xor-shift / multiply structure as fmix32,
// with 64-bit avalanche constants.
FORCE_INLINE uint64_t fmix64 ( uint64_t k )
{
    k = (k ^ (k >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
    k = (k ^ (k >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);
    return k ^ (k >> 33);
}

//-----------------------------------------------------------------------------

// Compute the 32-bit x86 variant of MurmurHash3.
//
//   key:  pointer to the bytes to hash (alignment handled via getblock32)
//   len:  number of bytes to hash
//   seed: starting value of the hash state
//   out:  receives the final 32-bit hash, written as a uint32_t
//
// NOTE(review): defined (not static/inline) in a header — including this
// header from more than one translation unit would produce duplicate
// symbols at link time; confirm it is included once per binary.
void MurmurHash3_x86_32 ( const void * key, int len,
                          uint32_t seed, void * out )
{
    const uint8_t * data = (const uint8_t*)key;
    const int nblocks = len / 4;

    uint32_t h1 = seed;

    const uint32_t c1 = 0xcc9e2d51;
    const uint32_t c2 = 0x1b873593;

    //----------
    // body: consume the input in 4-byte blocks, indexing backward from the
    // end of the block region (i runs from -nblocks up to -1)

    const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);

    for(int i = -nblocks; i; i++)
	{
	    uint32_t k1 = getblock32(blocks,i);

	    k1 *= c1;
	    k1 = ROTL32(k1,15);
	    k1 *= c2;
    
	    h1 ^= k1;
	    h1 = ROTL32(h1,13); 
	    h1 = h1*5+0xe6546b64;
	}

    //----------
    // tail: mix in the final 0-3 bytes; the case fall-throughs are
    // intentional (each case folds one more byte into k1)

    const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

    uint32_t k1 = 0;

    switch(len & 3)
	{
	case 3: k1 ^= tail[2] << 16; // fall through
	case 2: k1 ^= tail[1] << 8;  // fall through
	case 1: k1 ^= tail[0];
	    k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
	};

    //----------
    // finalization: fold in the length, then avalanche all bits

    h1 ^= len;

    h1 = fmix32(h1);

    *(uint32_t*)out = h1;
} 

//-----------------------------------------------------------------------------

// Compute the 128-bit x86 variant of MurmurHash3 (four 32-bit lanes;
// produces a different result than the x64_128 variant by design).
//
//   key:  pointer to the bytes to hash (alignment handled via getblock32)
//   len:  number of bytes to hash
//   seed: starting value for all four 32-bit lanes of the hash state
//   out:  receives the final 128-bit hash as four uint32_t words
//
// NOTE(review): defined (not static/inline) in a header — confirm the
// header is included from a single translation unit to avoid duplicate
// symbols at link time.
void MurmurHash3_x86_128 ( const void * key, const int len,
                           uint32_t seed, void * out )
{
    const uint8_t * data = (const uint8_t*)key;
    const int nblocks = len / 16;

    uint32_t h1 = seed;
    uint32_t h2 = seed;
    uint32_t h3 = seed;
    uint32_t h4 = seed;

    const uint32_t c1 = 0x239b961b; 
    const uint32_t c2 = 0xab0e9789;
    const uint32_t c3 = 0x38b34ae5; 
    const uint32_t c4 = 0xa1e38b93;

    //----------
    // body: consume 16 bytes per iteration (one 32-bit word per lane),
    // indexing backward from the end of the block region

    const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);

    for(int i = -nblocks; i; i++)
	{
	    uint32_t k1 = getblock32(blocks,i*4+0);
	    uint32_t k2 = getblock32(blocks,i*4+1);
	    uint32_t k3 = getblock32(blocks,i*4+2);
	    uint32_t k4 = getblock32(blocks,i*4+3);

	    k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;

	    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;

	    k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

	    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;

	    k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

	    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;

	    k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

	    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
	}

    //----------
    // tail: mix in the final 0-15 bytes; every case falls through by
    // design, folding bytes into the k1..k4 lanes from highest offset down

    const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

    uint32_t k1 = 0;
    uint32_t k2 = 0;
    uint32_t k3 = 0;
    uint32_t k4 = 0;

    switch(len & 15)
	{
	case 15: k4 ^= tail[14] << 16; // fall through
	case 14: k4 ^= tail[13] << 8;  // fall through
	case 13: k4 ^= tail[12] << 0;
	    k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
	    // fall through

	case 12: k3 ^= tail[11] << 24; // fall through
	case 11: k3 ^= tail[10] << 16; // fall through
	case 10: k3 ^= tail[ 9] << 8;  // fall through
	case  9: k3 ^= tail[ 8] << 0;
	    k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
	    // fall through

	case  8: k2 ^= tail[ 7] << 24; // fall through
	case  7: k2 ^= tail[ 6] << 16; // fall through
	case  6: k2 ^= tail[ 5] << 8;  // fall through
	case  5: k2 ^= tail[ 4] << 0;
	    k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
	    // fall through

	case  4: k1 ^= tail[ 3] << 24; // fall through
	case  3: k1 ^= tail[ 2] << 16; // fall through
	case  2: k1 ^= tail[ 1] << 8;  // fall through
	case  1: k1 ^= tail[ 0] << 0;
	    k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
	};

    //----------
    // finalization: fold in the length, cross-mix the four lanes,
    // avalanche each lane, then cross-mix again

    h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;

    h1 += h2; h1 += h3; h1 += h4;
    h2 += h1; h3 += h1; h4 += h1;

    h1 = fmix32(h1);
    h2 = fmix32(h2);
    h3 = fmix32(h3);
    h4 = fmix32(h4);

    h1 += h2; h1 += h3; h1 += h4;
    h2 += h1; h3 += h1; h4 += h1;

    ((uint32_t*)out)[0] = h1;
    ((uint32_t*)out)[1] = h2;
    ((uint32_t*)out)[2] = h3;
    ((uint32_t*)out)[3] = h4;
}

//-----------------------------------------------------------------------------

// MurmurHash3, 128-bit output, x64-optimized variant (Austin Appleby's
// public-domain reference implementation). Hashes `len` bytes at `key` with
// `seed` and writes 16 bytes (two uint64_t words) to `out`.
// NOTE(review): input is read via getblock64, so results are byte-order
// dependent; `out` must have room for 2 x uint64_t.
void MurmurHash3_x64_128 ( const void * key, const int len,
                           const uint32_t seed, void * out )
{
    const uint8_t * data = (const uint8_t*)key;
    const int nblocks = len / 16;  // number of complete 16-byte blocks

    uint64_t h1 = seed;
    uint64_t h2 = seed;

    // Mixing constants from the reference implementation.
    const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
    const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

    //----------
    // body: consume the input two 64-bit lanes (16 bytes) at a time

    const uint64_t * blocks = (const uint64_t *)(data);

    for(int i = 0; i < nblocks; i++)
	{
	    uint64_t k1 = getblock64(blocks,i*2+0);
	    uint64_t k2 = getblock64(blocks,i*2+1);

	    k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;

	    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

	    k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

	    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
	}

    //----------
    // tail: remaining 0-15 bytes. Each case deliberately FALLS THROUGH to
    // accumulate all remaining bytes into k1/k2 before the final mix lines.

    const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

    uint64_t k1 = 0;
    uint64_t k2 = 0;

    switch(len & 15)
	{
	case 15: k2 ^= ((uint64_t)tail[14]) << 48;
	case 14: k2 ^= ((uint64_t)tail[13]) << 40;
	case 13: k2 ^= ((uint64_t)tail[12]) << 32;
	case 12: k2 ^= ((uint64_t)tail[11]) << 24;
	case 11: k2 ^= ((uint64_t)tail[10]) << 16;
	case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
	case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
	    k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

	case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
	case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
	case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
	case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
	case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
	case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
	case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
	case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
	    k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
	};

    //----------
    // finalization: fold in the length, cross-mix the two lanes, and run
    // the 64-bit finalizer (fmix64) to force full avalanche.

    h1 ^= len; h2 ^= len;

    h1 += h2;
    h2 += h1;

    h1 = fmix64(h1);
    h2 = fmix64(h2);

    h1 += h2;
    h2 += h1;

    ((uint64_t*)out)[0] = h1;
    ((uint64_t*)out)[1] = h2;
}

#endif // _MURMURHASH3_H_


================================================
FILE: bench/bench.hpp
================================================
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>

#include <vector>
#include <fstream>
#include <iostream>
#include <utility>
#include <algorithm>
#include <random>
#include <climits>
#include <cstdlib>
#include <unordered_set>
#include <map>

namespace bench {

// Dataset / workload sizing caps shared by the benchmark drivers.
static const uint64_t kNumIntRecords = 100000000;
static const uint64_t kNumEmailRecords = 25000000;
static const uint64_t kNumTxns = 10000000;
// Width added to a query key to form the upper bound of a range query.
static const uint64_t kIntRangeSize = 92233697311;
static const uint64_t kEmailRangeSize = 128;

//static const uint64_t kRandintRangeSize = 328 * 1024 * 1024 * (uint64_t)1024;
//static const char* kWordloadDir = "workloads/";

// for pretty print
static const char* kGreen ="\033[0;32m";
static const char* kRed ="\033[0;31m";
static const char* kNoColor ="\033[0;0m";

// Wall-clock time in seconds (microsecond resolution), for benchmarking.
double getNow() {
  struct timeval tv;
  gettimeofday(&tv, 0);
  return tv.tv_sec + tv.tv_usec / 1000000.0;
}

// Encodes a uint64 as an 8-byte big-endian string so that lexicographic
// order of the encoded strings matches numeric order of the keys.
std::string uint64ToString(uint64_t key) {
    uint64_t endian_swapped_key = __builtin_bswap64(key);
    return std::string(reinterpret_cast<const char*>(&endian_swapped_key), 8);
}

// Inverse of uint64ToString. str_key must hold at least 8 bytes.
uint64_t stringToUint64(std::string str_key) {
    uint64_t int_key = 0;
    memcpy(reinterpret_cast<char*>(&int_key), str_key.data(), 8);
    return __builtin_bswap64(int_key);
}

// Loads up to kNumIntRecords / kNumEmailRecords keys from file_name.
// Integer keys are re-encoded big-endian via uint64ToString.
// BUG FIX: the old loop tested infile.good() *before* extracting, so a
// trailing newline caused one extra (stale/zero) key to be pushed. The
// extraction result itself is now the loop condition.
void loadKeysFromFile(const std::string& file_name, const bool is_key_int, 
		      std::vector<std::string> &keys) {
    std::ifstream infile(file_name);
    uint64_t count = 0;
    if (is_key_int) {
	uint64_t int_key;
	while (count < kNumIntRecords && (infile >> int_key)) {
	    keys.push_back(uint64ToString(int_key));
	    count++;
	}
    } else {
	std::string key;
	while (count < kNumEmailRecords && (infile >> key)) {
	    keys.push_back(key);
	    count++;
	}
    }
}

// Loads up to num_records raw uint64 keys from file_name.
// Same extraction-before-push fix as the string overload above.
void loadKeysFromFile(const std::string& file_name, uint64_t num_records,
		      std::vector<uint64_t> &keys) {
    std::ifstream infile(file_name);
    uint64_t count = 0;
    uint64_t key;
    while (count < num_records && (infile >> key)) {
	keys.push_back(key);
	count++;
    }
}

// Moves a random percent% sample of `keys` into `insert_keys` (sorted) and
// clears `keys`. 0 < percent <= 100.
// FIX: std::random_shuffle was deprecated in C++14 and removed in C++17;
// use std::shuffle with a default-seeded engine (deterministic per run,
// matching the old rand()-based behavior).
void selectKeysToInsert(const unsigned percent, 
			std::vector<std::string> &insert_keys, 
			std::vector<std::string> &keys) {
    std::mt19937 rng;
    std::shuffle(keys.begin(), keys.end(), rng);
    uint64_t num_insert_keys = keys.size() * percent / 100;
    for (uint64_t i = 0; i < num_insert_keys; i++)
	insert_keys.push_back(keys[i]);

    keys.clear();
    std::sort(insert_keys.begin(), insert_keys.end());
}

// Integer-key twin of selectKeysToInsert. 0 < percent <= 100.
void selectIntKeysToInsert(const unsigned percent, 
			   std::vector<uint64_t> &insert_keys, 
			   std::vector<uint64_t> &keys) {
    std::mt19937 rng;
    std::shuffle(keys.begin(), keys.end(), rng);
    uint64_t num_insert_keys = keys.size() * percent / 100;
    for (uint64_t i = 0; i < num_insert_keys; i++)
	insert_keys.push_back(keys[i]);

    keys.clear();
    std::sort(insert_keys.begin(), insert_keys.end());
}

// Overwrites one byte of each key with '+', counting position pos > 0 from
// the last byte; keys shorter than pos get their first byte replaced.
// FIX: empty keys are skipped (the old code wrote keys[i][0] out of bounds).
void modifyKeyByte(std::vector<std::string> &keys, int pos) {
    for (int i = 0; i < (int)keys.size(); i++) {
	if (keys[i].empty())
	    continue;
	int keylen = keys[i].length();
	if (keylen > pos)
	    keys[i][keylen - 1 - pos] = '+';
	else
	    keys[i][0] = '+';
    } 
}

// Derives the range-query upper bound for `key`: email keys get their last
// byte bumped by kEmailRangeSize; integer keys get kIntRangeSize added.
// FIX: empty email keys are returned unchanged (old code indexed size()-1).
std::string getUpperBoundKey(const std::string& key_type, const std::string& key) {
    std::string ret_str = key;
    if (key_type.compare(std::string("email")) == 0) {
	if (!ret_str.empty())
	    ret_str[ret_str.size() - 1] += (char)kEmailRangeSize;
    } else {
	uint64_t int_key = stringToUint64(key);
	int_key += kIntRangeSize;
	ret_str = uint64ToString(int_key);
    }
    return ret_str;
}

} // namespace bench


================================================
FILE: bench/bloom.hpp
================================================
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// Modified by Huanchen, 2018

#ifndef LEVELDB_BLOOM_H_
#define LEVELDB_BLOOM_H_

#include <stdint.h>
#include <string.h>

#include <vector>
#include <string>

#include "MurmurHash3.h"

using namespace std;

// Loads a 32-bit value from a possibly unaligned buffer in host byte order.
// Going through memcpy keeps the access well-defined (no alignment/aliasing
// UB); compilers lower it to a single load.
inline uint32_t DecodeFixed32(const char* ptr) {
    uint32_t value;
    memcpy(&value, ptr, sizeof(uint32_t));
    return value;
}

/*
inline uint32_t Hash(const char* data, size_t n, uint32_t seed) {
    // Similar to murmur hash
    const uint32_t m = 0xc6a4a793;
    const uint32_t r = 24;
    const char* limit = data + n;
    uint32_t h = seed ^ (n * m);

    // Pick up four bytes at a time
    while (data + 4 <= limit) {
	uint32_t w = DecodeFixed32(data);
	data += 4;
	h += w;
	h *= m;
	h ^= (h >> 16);
    }

    // Pick up remaining bytes
    switch (limit - data) {
    case 3:
	h += static_cast<unsigned char>(data[2]) << 16;
    case 2:
	h += static_cast<unsigned char>(data[1]) << 8;
    case 1:
	h += static_cast<unsigned char>(data[0]);
	h *= m;
	h ^= (h >> r);
	break;
    }
    return h;
}
*/
// Hashes a string key into 128 bits using MurmurHash3 (x86, 128-bit variant).
// `out` must point to at least 4 uint32_t's. The seed is a fixed constant so
// filter contents are reproducible across runs.
static void BloomHash(const string &key, uint32_t* out) {
    MurmurHash3_x86_128(key.c_str(), key.size(), 0xbc9f1d34, out);
}

// Integer-key overload: hashes the raw 8 bytes of `key` (host byte order)
// into 128 bits. `out` must point to at least 4 uint32_t's; same fixed seed
// as the string overload.
static void BloomHash(const uint64_t key, uint32_t* out) {
    MurmurHash3_x86_128((const char*)(&key), sizeof(uint64_t), 0xbc9f1d34, out);
}

class BloomFilter {
 private:
    size_t bits_per_key_;
    size_t k_;

 public:
 BloomFilter(int bits_per_key)
     : bits_per_key_(bits_per_key) {
	// We intentionally round down to reduce probing cost a little bit
	k_ = static_cast<size_t>(bits_per_key * 0.69);  // 0.69 =~ ln(2)
	if (k_ < 1) k_ = 1;
	if (k_ > 30) k_ = 30;
    }

    void CreateFilter(vector<string> keys, int n, string* dst) const {
	// Compute bloom filter size (in both bits and bytes)
	size_t bits = n * bits_per_key_;

	// For small n, we can see a very high false positive rate.  Fix it
	// by enforcing a minimum bloom filter length.
	if (bits < 64) bits = 64;

	size_t bytes = (bits + 7) / 8;
	bits = bytes * 8;

	const size_t init_size = dst->size();
	dst->resize(init_size + bytes, 0);
	dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
	char* array = &(*dst)[init_size];
	for (int i = 0; i < n; i++) {
	    // Use double-hashing to generate a sequence of hash values.
	    // See analysis in [Kirsch,Mitzenmacher 2006].
	    // uint32_t h = BloomHash(keys[i]);
	    uint32_t hbase[4];
	    BloomHash(keys[i], hbase);
	    uint32_t h = hbase[0];
	    const uint32_t delta = hbase[1];
	    for (size_t j = 0; j < k_; j++) {
		const uint32_t bitpos = h % bits;
		array[bitpos/8] |= (1 << (bitpos % 8));
		h += delta;
	    }
	}
    }

    void CreateFilter(vector<uint64_t> keys, int n, string* dst) const {
	// Compute bloom filter size (in both bits and bytes)
	size_t bits = n * bits_per_key_;

	// For small n, we can see a very high false positive rate.  Fix it
	// by enforcing a minimum bloom filter length.
	if (bits < 64) bits = 64;

	size_t bytes = (bits + 7) / 8;
	bits = bytes * 8;

	const size_t init_size = dst->size();
	dst->resize(init_size + bytes, 0);
	dst->push_back(static_cast<char>(k_));  // Remember # of probes in filter
	char* array = &(*dst)[init_size];
	for (int i = 0; i < n; i++) {
	    // Use double-hashing to generate a sequence of hash values.
	    // See analysis in [Kirsch,Mitzenmacher 2006].
	    //uint32_t h = BloomHash(keys[i]);
	    uint32_t hbase[4];
	    BloomHash(keys[i], hbase);
	    uint32_t h = hbase[0];
	    const uint32_t delta = hbase[1];
	    for (size_t j = 0; j < k_; j++) {
		const uint32_t bitpos = h % bits;
		array[bitpos/8] |= (1 << (bitpos % 8));
		h += delta;
	    }
	}
    }

    bool KeyMayMatch(const string& key, const string& bloom_filter) const {
	const size_t len = bloom_filter.size();
	if (len < 2) return false;

	const char* array = bloom_filter.c_str();
	const size_t bits = (len - 1) * 8;

	// Use the encoded k so that we can read filters generated by
	// bloom filters created using different parameters.
	const size_t k = array[len-1];
	if (k > 30) {
	    // Reserved for potentially new encodings for short bloom filters.
	    // Consider it a match.
	    return true;
	}

	uint32_t hbase[4];
	BloomHash(key, hbase);
	uint32_t h = hbase[0];
	const uint32_t delta = hbase[1];
	for (size_t j = 0; j < k; j++) {
	    const uint32_t bitpos = h % bits;
	    if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
	    h += delta;
	}
	return true;
    }

    bool KeyMayMatch(const uint64_t key, const string& bloom_filter) const {
	const size_t len = bloom_filter.size();
	if (len < 2) return false;

	const char* array = bloom_filter.c_str();
	const size_t bits = (len - 1) * 8;

	// Use the encoded k so that we can read filters generated by
	// bloom filters created using different parameters.
	const size_t k = array[len-1];
	if (k > 30) {
	    // Reserved for potentially new encodings for short bloom filters.
	    // Consider it a match.
	    return true;
	}

	uint32_t hbase[4];
	BloomHash(key, hbase);
	uint32_t h = hbase[0];
	const uint32_t delta = hbase[1];
	for (size_t j = 0; j < k; j++) {
	    const uint32_t bitpos = h % bits;
	    if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
	    h += delta;
	}
	return true;
    }
};


#endif  // LEVELDB_BLOOM_H_


================================================
FILE: bench/filter.hpp
================================================
#ifndef FILTER_H_
#define FILTER_H_

#include <string>
#include <vector>

namespace bench {

// Abstract interface every benchmarked filter (SuRF variants, Bloom)
// implements. FilterFactory hands these out as Filter*.
class Filter {
public:
    // FIX: virtual destructor added. Implementations own heap resources
    // (e.g. FilterBloom's BloomFilter*), and without this, deleting through
    // a Filter* is undefined behavior and leaks the derived state.
    virtual ~Filter() {}
    // True if key may be in the set (false positives possible).
    virtual bool lookup(const std::string& key) = 0;
    // True if [left_key, right_key] may contain a stored key.
    virtual bool lookupRange(const std::string& left_key, const std::string& right_key) = 0;
    // NOTE(review): returns bool, but workload.cpp accumulates it into a
    // count; any real count from an implementation is narrowed to 0/1.
    // Consider widening the return type to uint64_t across implementations.
    virtual bool approxCount(const std::string& left_key, const std::string& right_key) = 0;
    // Filter size in bytes.
    virtual uint64_t getMemoryUsage() = 0;
};

} // namespace bench

#endif // FILTER_H


================================================
FILE: bench/filter_bloom.hpp
================================================
#ifndef FILTER_BLOOM_H_
#define FILTER_BLOOM_H_

#include <string>
#include <vector>

#include "bloom.hpp"

namespace bench {

// Bloom-filter adapter for the bench::Filter interface. Range and count
// queries are unsupported and report an error.
class FilterBloom : public Filter {
public:
    // Requires that keys are sorted
    FilterBloom(const std::vector<std::string>& keys) {
	filter_ = new BloomFilter(kBitsPerKey);
	filter_->CreateFilter(keys, keys.size(), &filter_data_);
    }

    // FIX: non-copyable. The destructor deletes filter_, so the implicit
    // copy operations would lead to a double-delete.
    FilterBloom(const FilterBloom&) = delete;
    FilterBloom& operator=(const FilterBloom&) = delete;

    ~FilterBloom() {
	delete filter_;
    }

    bool lookup(const std::string& key) {
	return filter_->KeyMayMatch(key, filter_data_);
    }

    // Unsupported: always reports an error and returns false.
    bool lookupRange(const std::string& left_key, const std::string& right_key) {
	std::cout << kRed << "A Bloom filter does not support range queries\n" << kNoColor;
	return false;
    }

    // Unsupported: always reports an error and returns false.
    bool approxCount(const std::string& left_key, const std::string& right_key) {
	std::cout << kRed << "A Bloom filter does not support approximate count queries\n" << kNoColor;
	return false;
    }

    uint64_t getMemoryUsage() {
	return filter_data_.size();
    }

private:
    // ~10 bits/key gives roughly a 1% false-positive rate for a Bloom filter.
    static constexpr int kBitsPerKey = 10;

    BloomFilter* filter_;   // owned; freed in the destructor
    std::string filter_data_;
};

} // namespace bench

#endif // FILTER_BLOOM_H


================================================
FILE: bench/filter_factory.hpp
================================================
#ifndef FILTER_FACTORY_H_
#define FILTER_FACTORY_H_

#include "filter.hpp"
#include "filter_bloom.hpp"
#include "filter_surf.hpp"

namespace bench {

// Builds the filter implementation named by filter_type over the (sorted)
// key set; unrecognized names fall back to SuRFReal.
class FilterFactory {
public:
    static Filter* createFilter(const std::string& filter_type,
				const uint32_t suffix_len,
				const std::vector<std::string>& keys) {
	if (filter_type == "SuRF")
	    return new FilterSuRF(keys, surf::kNone, 0, 0);
	if (filter_type == "SuRFHash")
	    return new FilterSuRF(keys, surf::kHash, suffix_len, 0);
	if (filter_type == "SuRFReal")
	    return new FilterSuRF(keys, surf::kReal, 0, suffix_len);
	if (filter_type == "SuRFMixed")
	    return new FilterSuRF(keys, surf::kMixed, suffix_len, suffix_len);
	if (filter_type == "Bloom")
	    return new FilterBloom(keys);
	// default
	return new FilterSuRF(keys, surf::kReal, 0, suffix_len);
    }
};

} // namespace bench

#endif // FILTER_FACTORY_H


================================================
FILE: bench/filter_surf.hpp
================================================
#ifndef FILTER_SURF_H_
#define FILTER_SURF_H_

#include <string>
#include <vector>

#include "surf.hpp"

namespace bench {

// Adapter wrapping surf::SuRF behind the bench::Filter interface.
class FilterSuRF : public Filter {
public:
    // Requires that keys are sorted
    FilterSuRF(const std::vector<std::string>& keys,
	       const surf::SuffixType suffix_type,
               const uint32_t hash_suffix_len, const uint32_t real_suffix_len) {
	// uses default sparse-dense size ratio
	filter_ = new surf::SuRF(keys, surf::kIncludeDense, surf::kSparseDenseRatio,
				 suffix_type, hash_suffix_len, real_suffix_len);
    }

    // FIX: non-copyable. The destructor destroys and deletes filter_, so
    // the implicit copy operations would lead to a double-free.
    FilterSuRF(const FilterSuRF&) = delete;
    FilterSuRF& operator=(const FilterSuRF&) = delete;

    ~FilterSuRF() {
	filter_->destroy();
	delete filter_;
    }

    bool lookup(const std::string& key) {
	return filter_->lookupKey(key);
    }

    // Range query with both endpoints inclusive.
    bool lookupRange(const std::string& left_key, const std::string& right_key) {
	//return filter_->lookupRange(left_key, false, right_key, false);
	return filter_->lookupRange(left_key, true, right_key, true);
    }

    // NOTE(review): SuRF's approxCount result is narrowed to bool by the
    // Filter interface; callers summing this only count non-empty ranges.
    bool approxCount(const std::string& left_key, const std::string& right_key) {
	return filter_->approxCount(left_key, right_key);
    }

    uint64_t getMemoryUsage() {
	return filter_->getMemoryUsage();
    }

private:
    surf::SuRF* filter_;   // owned; destroyed in the destructor
};

} // namespace bench

#endif // FILTER_SURF_H


================================================
FILE: bench/run.sh
================================================
#!/bin/bash
# Runs the standard filter benchmark suite over random-integer keys.
# BUG FIX: shebang was "#!bin/bash" (relative path), which breaks direct
# execution (./run.sh); it must be an absolute path.

echo 'Bloom Filter, random int, point queries'
../build/bench/workload Bloom 1 mixed 50 0 randint point zipfian

echo 'SuRF, random int, point queries'
../build/bench/workload SuRF 1 mixed 50 0 randint point zipfian

echo 'SuRFHash, 4-bit suffixes, random int, point queries'
../build/bench/workload SuRFHash 4 mixed 50 0 randint point zipfian

echo 'SuRFReal, 4-bit suffixes, random int, point queries'
../build/bench/workload SuRFReal 4 mixed 50 0 randint point zipfian

echo 'SuRFMixed, 2-bit hash suffixes and 2-bit real suffixes, random int, point queries'
../build/bench/workload SuRFMixed 2 mixed 50 0 randint mix zipfian


# echo 'Bloom Filter, email, point queries'
# ../build/bench/workload Bloom 1 mixed 50 0 email point zipfian

# echo 'SuRF, email, point queries'
# ../build/bench/workload SuRF 1 mixed 50 0 email point zipfian

# echo 'SuRFHash, 4-bit suffixes, email, point queries'
# ../build/bench/workload SuRFHash 4 mixed 50 0 email point zipfian

# echo 'SuRFReal, 4-bit suffixes, email, point queries'
# ../build/bench/workload SuRFReal 4 mixed 50 0 email point zipfian

# echo 'SuRFMixed, 2-bit hash suffixes and 2-bit real suffixes, email, point queries'
# ../build/bench/workload SuRFMixed 2 mixed 50 0 email mix zipfian


echo 'SuRFReal, 4-bit suffixes, random int, range queries'
../build/bench/workload SuRFReal 4 mixed 50 0 randint range zipfian

# echo 'SuRFReal, 4-bit suffixes, email, point queries'
# ../build/bench/workload SuRFReal 4 mixed 50 0 email range zipfian



================================================
FILE: bench/workload.cpp
================================================
#include "bench.hpp"
#include "filter_factory.hpp"

// Single-threaded filter benchmark driver.
//
// Builds the requested filter over a random percent% sample of the load
// keys, replays the transaction keys as point/range/count queries, then
// recomputes ground truth with a std::map to report the false-positive rate.
//
// Fixed in this revision:
//  - the workload-type validation used `compare("alterByte") == 0` inside
//    the &&, making the whole check unsatisfiable (any workload type was
//    silently accepted);
//  - suffix_len is only required for the SuRF variants that store suffixes
//    (plain SuRF and Bloom ignore it, so 0 is valid there);
//  - typo in usage text ("conting" -> "counting").
int main(int argc, char *argv[]) {
    if (argc != 9) {
	std::cout << "Usage:\n";
	std::cout << "1. filter type: SuRF, SuRFHash, SuRFReal, SuRFMixed, Bloom\n";
	std::cout << "2. suffix length: 0 < len <= 64 (for SuRFHash and SuRFReal only)\n";
	std::cout << "3. workload type: mixed, alterByte (only for email key)\n";
	std::cout << "4. percentage of keys inserted: 0 < num <= 100\n";
	std::cout << "5. byte position (counting from last, only for alterByte): num\n";
	std::cout << "6. key type: randint, email\n";
	std::cout << "7. query type: point, range, mix, count-long, count-short\n";
	std::cout << "8. distribution: uniform, zipfian, latest\n";
	return -1;
    }

    std::string filter_type = argv[1];
    uint32_t suffix_len = (uint32_t)atoi(argv[2]);
    std::string workload_type = argv[3];
    unsigned percent = atoi(argv[4]);
    unsigned byte_pos = atoi(argv[5]);
    std::string key_type = argv[6];
    std::string query_type = argv[7];
    std::string distribution = argv[8];

    // check args ====================================================
    if (filter_type.compare("SuRF") != 0
	&& filter_type.compare("SuRFHash") != 0
	&& filter_type.compare("SuRFReal") != 0
	&& filter_type.compare("SuRFMixed") != 0
	&& filter_type.compare("Bloom") != 0
	&& filter_type.compare("ARF") != 0) {
	std::cout << bench::kRed << "WRONG filter type\n" << bench::kNoColor;
	return -1;
    }

    // Only the suffix-bearing SuRF variants consume suffix_len.
    const bool uses_suffix = filter_type.compare("SuRFHash") == 0
	|| filter_type.compare("SuRFReal") == 0
	|| filter_type.compare("SuRFMixed") == 0;
    if (uses_suffix && (suffix_len == 0 || suffix_len > 64)) {
	std::cout << bench::kRed << "WRONG suffix length\n" << bench::kNoColor;
	return -1;
    }

    // BUG FIX: was `compare("alterByte") == 0`, so this condition could
    // never be true and invalid workload types passed validation.
    if (workload_type.compare("mixed") != 0
	&& workload_type.compare("alterByte") != 0) {
	std::cout << bench::kRed << "WRONG workload type\n" << bench::kNoColor;
	return -1;
    }

    if (percent > 100) {
	std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor;
	return -1;
    }

    if (key_type.compare("randint") != 0
	&& key_type.compare("timestamp") != 0
	&& key_type.compare("email") != 0) {
	std::cout << bench::kRed << "WRONG key type\n" << bench::kNoColor;
	return -1;
    }

    if (query_type.compare("point") != 0
	&& query_type.compare("range") != 0
	&& query_type.compare("mix") != 0
	&& query_type.compare("count-long") != 0
	&& query_type.compare("count-short") != 0) {
	std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor;
	return -1;
    }

    if (distribution.compare("uniform") != 0
	&& distribution.compare("zipfian") != 0
	&& distribution.compare("latest") != 0) {
	std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor;
	return -1;
    }

    // load keys from files =======================================
    std::string load_file = "workloads/load_";
    load_file += key_type;
    std::vector<std::string> load_keys;
    if (key_type.compare("email") == 0)
	bench::loadKeysFromFile(load_file, false, load_keys);
    else
	bench::loadKeysFromFile(load_file, true, load_keys);

    std::string txn_file = "workloads/txn_";
    txn_file += key_type;
    txn_file += "_";
    txn_file += distribution;
    std::vector<std::string> txn_keys;
    if (key_type.compare("email") == 0)
	bench::loadKeysFromFile(txn_file, false, txn_keys);
    else
	bench::loadKeysFromFile(txn_file, true, txn_keys);

    // sample percent% of the load keys; selectKeysToInsert sorts them
    std::vector<std::string> insert_keys;
    bench::selectKeysToInsert(percent, insert_keys, load_keys);

    if (workload_type.compare("alterByte") == 0)
	bench::modifyKeyByte(txn_keys, byte_pos);

    // compute keys for approximate count-long queries =================
    // each adjacent txn-key pair becomes one ordered (left, right) range
    std::vector<std::string> left_keys, right_keys;
    if (query_type.compare("count-long") == 0) {
    	for (int i = 0; i < (int)txn_keys.size() - 1; i++) {
    	    if (txn_keys[i].compare(txn_keys[i + 1]) < 0) {
    		left_keys.push_back(txn_keys[i]);
    		right_keys.push_back(txn_keys[i + 1]);
    	    } else {
    		left_keys.push_back(txn_keys[i + 1]);
    		right_keys.push_back(txn_keys[i]);
    	    }
    	}
    }

    // create filter ==============================================
    double time1 = bench::getNow();
    bench::Filter* filter = bench::FilterFactory::createFilter(filter_type, suffix_len, insert_keys);
    double time2 = bench::getNow();
    std::cout << "Build time = " << (time2 - time1) << std::endl;

    // execute transactions =======================================
    int64_t positives = 0;
    uint64_t count = 0;
    double start_time = bench::getNow();

    if (query_type.compare("point") == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++)
	    positives += (int)filter->lookup(txn_keys[i]);
    } else if (query_type.compare("range") == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++)
	    if (key_type.compare("email") == 0) {
		std::string ret_str = txn_keys[i];
		ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize;
		positives += (int)filter->lookupRange(txn_keys[i], ret_str);
	    } else {
		positives += (int)filter->lookupRange(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize));
	    }
    } else if (query_type.compare("mix") == 0) {
	// alternate point (even i) and range (odd i) lookups
	for (int i = 0; i < (int)txn_keys.size(); i++) {
	    if (i % 2 == 0) {
		positives += (int)filter->lookup(txn_keys[i]);
	    } else {
		if (key_type.compare("email") == 0) {
		    std::string ret_str = txn_keys[i];
		    ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize;
		    positives += (int)filter->lookupRange(txn_keys[i], ret_str);
		} else {
		    positives += (int)filter->lookupRange(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize));
		}
	    }
	}
    } else if (query_type.compare("count-long") == 0) {
	// NOTE(review): approxCount returns bool, so `count` only tallies
	// non-empty ranges, not the approximate key counts themselves.
	for (int i = 0; i < (int)txn_keys.size() - 1; i++)
	    count += filter->approxCount(left_keys[i], right_keys[i]);
    } else if (query_type.compare("count-short") == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++)
	    if (key_type.compare("email") == 0) {
		std::string ret_str = txn_keys[i];
		ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize;
		count += filter->approxCount(txn_keys[i], ret_str);
	    } else {
		count += filter->approxCount(txn_keys[i], bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize));
	    }
    }

    double end_time = bench::getNow();

    // compute true positives ======================================
    // ground truth via an ordered map over the inserted keys
    std::map<std::string, bool> ht;
    for (int i = 0; i < (int)insert_keys.size(); i++)
	ht[insert_keys[i]] = true;

    int64_t true_positives = 0;
    std::map<std::string, bool>::iterator ht_iter;
    if (query_type.compare("point") == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++) {
	    ht_iter = ht.find(txn_keys[i]);
	    true_positives += (ht_iter != ht.end());
	}
    } else if (query_type.compare("range") == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++) {
	    ht_iter = ht.lower_bound(txn_keys[i]);
	    if (ht_iter != ht.end()) {
		std::string fetched_key = ht_iter->first;
		if (key_type.compare("email") == 0) {
		    std::string ret_str = txn_keys[i];
		    ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize;
		    true_positives += (fetched_key.compare(ret_str) < 0);
		} else {
		    true_positives += (fetched_key.compare(bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)) < 0);
		}
	    }
	}
    } else if (query_type.compare("mix") == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++) {
	    if (i % 2 == 0) {
		ht_iter = ht.find(txn_keys[i]);
		true_positives += (ht_iter != ht.end());
	    } else {
		ht_iter = ht.lower_bound(txn_keys[i]);
		if (ht_iter != ht.end()) {
		    std::string fetched_key = ht_iter->first;
		    if (key_type.compare("email") == 0) {
			std::string ret_str = txn_keys[i];
			ret_str[ret_str.size() - 1] += (char)bench::kEmailRangeSize;
			true_positives += (fetched_key.compare(ret_str) < 0);
		    } else {
			true_positives += (fetched_key.compare(bench::uint64ToString(bench::stringToUint64(txn_keys[i]) + bench::kIntRangeSize)) < 0);
		    }
		}	
	    }
	}
    }
    int64_t false_positives = positives - true_positives;
    assert(false_positives >= 0);
    int64_t true_negatives = txn_keys.size() - positives;

    // print
    double tput = txn_keys.size() / (end_time - start_time) / 1000000; // Mops/sec
    std::cout << bench::kGreen << "Throughput = " << bench::kNoColor << tput << "\n";

    std::cout << "positives = " << positives << "\n";
    std::cout << "true positives = " << true_positives << "\n";
    std::cout << "false positives = " << false_positives << "\n";
    std::cout << "true negatives = " << true_negatives << "\n";
    std::cout << "count = " << count << "\n";

    double fp_rate = 0;
    if (false_positives > 0)
	fp_rate = false_positives / (true_negatives + false_positives + 0.0);
    std::cout << bench::kGreen << "False Positive Rate = " << bench::kNoColor << fp_rate << "\n";

    std::cout << bench::kGreen << "Memory = " << bench::kNoColor << filter->getMemoryUsage() << "\n\n";

    return 0;
}


================================================
FILE: bench/workload_arf.cpp
================================================
#include "bench.hpp"
#include "ARF.h"
#include "Database.h"
#include "Query.h"

// Sizing for the ARF (adaptive range filter) comparison benchmark.
static const int kARFSize = 70000000;       // budget passed to ARF::truncate -- presumably bits; TODO confirm against ARF.h
static const int kInputSize = 10000000;     // keys read from the load file
static const int kTxnSize = 10000000;       // query transactions read from the txn file
static const int kTrainingSize = 2000000;   // first N txns train the ARF; the rest are measured
static const uint64_t kDomain = (ULLONG_MAX / 2 - 1);  // key-domain upper bound handed to the ARF constructor
static const uint64_t kRangeSize = 922336973116;       // width of each generated range query

// ARF benchmark driver: builds a perfect ARF over a sampled key set, trains
// it on the first kTrainingSize transactions, then measures accuracy and
// throughput on the remaining transactions.
//
// Fixed in this revision:
//  - execution-phase throughput divided by kTrainingSize instead of the
//    number of transactions actually executed (kTxnSize - kTrainingSize);
//  - removed a stray duplicate end_time assignment after the timing print;
//  - bail out early if the load file yields no keys (the old code indexed
//    load_keys[size - 1] unconditionally).
int main(int argc, char *argv[]) {
    if (argc != 4) {
	std::cout << "Usage:\n";
	std::cout << "1. percentage of keys inserted: 0 < num <= 100\n";
	std::cout << "2. query type: point, range\n";
	std::cout << "3. distribution: uniform, zipfian, latest\n";
	return -1;
    }

    unsigned percent = atoi(argv[1]);
    std::string query_type = argv[2];
    std::string distribution = argv[3];

    // check args ====================================================
    if (percent > 100) {
	std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor;
	return -1;
    }

    if (query_type.compare("point") != 0
	&& query_type.compare("range") != 0) {
	std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor;
	return -1;
    }

    if (distribution.compare("uniform") != 0
	&& distribution.compare("zipfian") != 0
	&& distribution.compare("latest") != 0) {
	std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor;
	return -1;
    }

    // load keys from files =======================================
    std::string load_file = "workloads/load_randint";
    std::vector<uint64_t> load_keys;
    bench::loadKeysFromFile(load_file, kInputSize, load_keys);
    std::cout << "load_keys size = " << load_keys.size() << "\n";
    if (load_keys.empty()) {
	std::cout << bench::kRed << "FAILED to load keys\n" << bench::kNoColor;
	return -1;
    }

    sort(load_keys.begin(), load_keys.end());
    uint64_t max_key = load_keys[load_keys.size() - 1];
    std::cout << std::hex << "max key = " << max_key << std::dec << "\n";
    uint64_t max_gap = load_keys[load_keys.size() - 1] - load_keys[0];
    std::cout << "max gap = " << max_gap << "\n";
    uint64_t avg_gap = max_gap / kInputSize;
    std::cout << "avg gap = " << avg_gap << "\n";

    std::string txn_file = "workloads/txn_randint_";
    txn_file += distribution;
    std::vector<uint64_t> txn_keys;
    bench::loadKeysFromFile(txn_file, kTxnSize, txn_keys);
    std::cout << "txn_keys size = " << txn_keys.size() << "\n";

    std::vector<uint64_t> insert_keys;
    bench::selectIntKeysToInsert(percent, insert_keys, load_keys);
    std::cout << "insert_keys size = " << insert_keys.size() << "\n";

    // compute upperbound keys for range queries =================
    // point queries are modeled as [k, k] ranges
    std::vector<uint64_t> upper_bound_keys;
    if (query_type.compare("range") == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++) {
	    txn_keys[i]++;
	    uint64_t upper_bound = txn_keys[i] + kRangeSize;
	    upper_bound_keys.push_back(upper_bound);
	}
    } else {
	for (int i = 0; i < (int)txn_keys.size(); i++) {
	    upper_bound_keys.push_back(txn_keys[i]);
	}
    }

    // create filter ==============================================
    arf::Database* db = new arf::Database(insert_keys);
    arf::ARF* filter = new arf::ARF(0, kDomain, db);

    // build perfect ARF ==========================================
    double start_time = bench::getNow();
    filter->perfect(db);
    double end_time = bench::getNow();
    double time_diff = end_time - start_time;
    std::cout << "build perfect time = " << time_diff << " s\n";

    // training ===================================================
    // feed the ARF ground-truth answers for the first kTrainingSize txns,
    // then truncate it to the space budget
    start_time = bench::getNow();
    for (int i = 0; i < kTrainingSize; i++) {
	if (i % 100000 == 0)
	    std::cout << "i = " << i << std::endl;
	bool qR = db->rangeQuery(txn_keys[i], upper_bound_keys[i]);
	filter->handle_query(txn_keys[i], upper_bound_keys[i], qR, true);
    }
    filter->reset_training_phase();
    filter->truncate(kARFSize);
    filter->end_training_phase();
    filter->print_size();
    end_time = bench::getNow();
    time_diff = end_time - start_time;
    std::cout << "training time = " << time_diff << " s\n";
    std::cout << "training throughput = " << ((kTrainingSize + 0.0) / time_diff) << " txns/s\n";

    // execute transactions =======================================
    const int kExecSize = kTxnSize - kTrainingSize;  // txns actually measured
    int64_t positives = 0;
    start_time = bench::getNow();
    for (int i = kTrainingSize; i < kTxnSize; i++) {
	positives += (int)filter->handle_query(txn_keys[i], upper_bound_keys[i], true, false);
    }
    end_time = bench::getNow();
    time_diff = end_time - start_time;
    std::cout << "time = " << time_diff << " s\n";
    // BUG FIX: previously divided by kTrainingSize, but kExecSize
    // transactions were executed in this phase.
    std::cout << "throughput = " << bench::kGreen << ((kExecSize + 0.0) / time_diff) << bench::kNoColor << " txns/s\n";

    // compute true positives ======================================
    int64_t tps = 0;
    int64_t tns = 0;
    for (int i = kTrainingSize; i < kTxnSize; i++) {
	bool dR = db->rangeQuery(txn_keys[i], upper_bound_keys[i]);
	if (dR)
	    tps++;
	else
	    tns++;
    }
    int64_t fps = positives - tps;

    std::cout << "positives = " << positives << "\n";
    std::cout << "true positives = " << tps << "\n";
    std::cout << "true negatives = " << tns << "\n";
    std::cout << "false positives = " << fps << "\n";

    double fp_rate = 0;
    if (fps >= 0)
	fp_rate = fps / (tns + fps + 0.0);
    else
	std::cout << "ERROR: fps < 0\n";
    std::cout << "False Positive Rate = " << fp_rate << "\n";

    return 0;
}


================================================
FILE: bench/workload_gen/gen_load.py
================================================
import sys
import os

# ANSI terminal escape sequences used to colorize console output.
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # reset all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

#####################################################################################

def reverseHostName ( email ) :
    # Reverse the host portion of an e-mail address so keys sort by
    # domain, e.g. 'bob@gmail.com' -> 'com.gmail.@bob'.
    #
    # email: an address, optionally with a trailing newline (callers pass
    #        raw lines from readlines(), which keep their '\n').
    # Returns '<reversed.host.>@<name>'.
    name, sep, host = email.partition('@')
    # Bug fix: the original used host[:-1], which assumes a trailing
    # newline and silently chops the last character of the domain when
    # there is none (e.g. a file's final line).  Strip newlines only.
    hostparts = host.rstrip('\r\n').split('.')
    r_host = ''
    for part in hostparts :
        r_host = part + '.' + r_host
    return r_host + sep + name

#####################################################################################

# ----- argument parsing ------------------------------------------------
# Expects: gen_load.py <key_type> <distribution>
if (len(sys.argv) < 3) :
    print bcolors.FAIL + 'Usage:'
    print 'arg 1, key type: randint, timestamp, email' 
    print 'arg 2, distribution: uniform, zipfian, latest' + bcolors.ENDC
    sys.exit()

key_type = sys.argv[1]
distribution = sys.argv[2]

print bcolors.OKGREEN + 'key type = ' + key_type 
print 'distribution = ' + distribution + bcolors.ENDC

# Fixed directory layout: YCSB binary, workload specs, and the directory
# that receives all generated workload files.
ycsb_dir = 'YCSB/bin/'
workload_dir = 'workload_spec/'
output_dir='../workloads/'

# E-mail key source: a flat list of addresses plus its hard-coded line
# count; email_keymap.txt records which YCSB key maps to which address.
email_list = 'email_list.txt'
email_list_size = 27549660
email_keymap_file = output_dir + 'email_keymap.txt'

# Timestamp key source (one timestamp per line) and its keymap file.
timestamp_list = 'poisson_timestamps.csv'
timestamp_keymap_file = output_dir + 'timestamp_keymap.txt'

# NOTE(review): the message below omits 'timestamp' even though it is
# accepted; runtime string left untouched in this documentation pass.
if key_type != 'randint' and key_type != 'timestamp' and key_type != 'email' :
    print bcolors.FAIL + 'Incorrect key_type: please pick from randint and email' + bcolors.ENDC
    sys.exit()

if distribution != 'uniform' and distribution != 'zipfian' and distribution != 'latest' :
    print bcolors.FAIL + 'Incorrect distribution: please pick from uniform, zipfian and latest' + bcolors.ENDC
    sys.exit()

# Output names: raw YCSB stdout dump, extracted YCSB keys, and the final
# load file for the chosen key type.
out_ycsb_load = output_dir + 'ycsb_load_' + key_type
out_load_ycsbkey = output_dir + 'load_' + 'ycsbkey'
out_load = output_dir + 'load_' + key_type

# Run the YCSB load phase with the matching workload spec, redirecting
# its stdout into out_ycsb_load.
cmd_ycsb_load = ycsb_dir + 'ycsb load basic -P ' + workload_dir + 'workloadc_' + key_type + '_' + distribution + ' -s > ' + out_ycsb_load

os.system(cmd_ycsb_load)

#####################################################################################

# Pull the YCSB key out of every INSERT record: column 2 holds the key
# with a 4-character prefix stripped (presumably YCSB's 'user' prefix --
# verify against the dump format), writing one key per line.
f_load = open (out_ycsb_load, 'r')
f_load_out = open (out_load_ycsbkey, 'w')
for line in f_load :
    cols = line.split()
    if len(cols) > 2 and cols[0] == "INSERT":
        f_load_out.write (cols[2][4:] + '\n')
f_load.close()
f_load_out.close()

# Delete the raw YCSB dump; only the extracted keys are kept.
cmd = 'rm -f ' + out_ycsb_load
os.system(cmd)

#####################################################################################

# Translate the extracted YCSB keys into the final key type.  Every
# branch leaves f_load and f_load_out open; both are closed at the end.
if key_type == 'randint' :
    # Random integers: the YCSB keys are used verbatim.
    f_load = open (out_load_ycsbkey, 'r')
    f_load_out = open (out_load, 'w')
    for line in f_load :
        f_load_out.write (line)

elif key_type == 'timestamp' :
    # Replace each YCSB key with the next timestamp from the list and
    # record the key -> timestamp mapping (each ts keeps its '\n').
    timestamp_keymap = {}
    f_timestamp_keymap = open (timestamp_keymap_file, 'w')

    # NOTE(review): f_timestamp is never closed; harmless in a short
    # script but worth tidying.
    f_timestamp = open (timestamp_list, 'r')
    timestamps = f_timestamp.readlines()

    f_load_out = open (out_load, 'w')
    f_load = open (out_load_ycsbkey, 'r')
    count = 0
    for line in f_load :
        cols = line.split()
        ts = timestamps[count]
        f_load_out.write (ts)
        f_timestamp_keymap.write (cols[0] + ' ' + ts)
        count += 1
    f_timestamp_keymap.close()

elif key_type == 'email' :
    # Sample the e-mail list evenly (one address every 'gap' lines),
    # reverse each host name, and record the key -> e-mail mapping.
    email_keymap = {}
    f_email_keymap = open (email_keymap_file, 'w')

    f_email = open (email_list, 'r')
    emails = f_email.readlines()

    f_load = open (out_load_ycsbkey, 'r')
    f_load_out = open (out_load, 'w')

    # Count the sampled keys to compute the sampling stride.  This is
    # Python 2 floor division; under Python 3 'gap' would become a float
    # and break the list index below.
    sample_size = len(f_load.readlines())
    gap = email_list_size / sample_size

    # Re-open to iterate from the beginning after readlines() above.
    f_load.close()
    f_load = open (out_load_ycsbkey, 'r')
    count = 0
    for line in f_load :
        cols = line.split()
        email = reverseHostName(emails[count * gap])
        f_load_out.write (email + '\n')
        f_email_keymap.write (cols[0] + ' ' + email + '\n')
        count += 1
    f_email_keymap.close()

f_load.close()
f_load_out.close()

# The intermediate YCSB-key file is no longer needed.
cmd = 'rm -f ' + out_load_ycsbkey
os.system(cmd)


================================================
FILE: bench/workload_gen/gen_txn.py
================================================
import sys
import os

# ANSI terminal escape sequences used to colorize console output.
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'  # reset all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

#####################################################################################

def reverseHostName ( email ) :
    # Reverse the host portion of an e-mail address so keys sort by
    # domain, e.g. 'bob@gmail.com' -> 'com.gmail.@bob'.
    #
    # email: an address, optionally with a trailing newline (callers pass
    #        raw lines from readlines(), which keep their '\n').
    # Returns '<reversed.host.>@<name>'.
    name, sep, host = email.partition('@')
    # Bug fix: the original used host[:-1], which assumes a trailing
    # newline and silently chops the last character of the domain when
    # there is none (e.g. a file's final line).  Strip newlines only.
    hostparts = host.rstrip('\r\n').split('.')
    r_host = ''
    for part in hostparts :
        r_host = part + '.' + r_host
    return r_host + sep + name

#####################################################################################

# ----- argument parsing ------------------------------------------------
# Expects: gen_txn.py <key_type> <distribution>
if (len(sys.argv) < 3) :
    print bcolors.FAIL + 'Usage:'
    print 'arg 1, key type: randint, timestamp, email' 
    print 'arg 2, distribution: uniform, zipfian, latest' + bcolors.ENDC
    sys.exit()

key_type = sys.argv[1]
distribution = sys.argv[2]

print bcolors.OKGREEN +  'key type = ' + key_type
print 'distribution = ' + distribution + bcolors.ENDC

# Fixed directory layout: YCSB binary, workload specs, and the directory
# that receives all generated workload files.
ycsb_dir = 'YCSB/bin/'
workload_dir = 'workload_spec/'
output_dir='../workloads/'

# E-mail key source and the keymap written earlier by gen_load.py.
email_list = 'email_list.txt'
email_list_size = 27549660
email_keymap_file = output_dir + 'email_keymap.txt'

# Timestamp key source and its keymap (also written by gen_load.py).
timestamp_list = 'poisson_timestamps.csv'
timestamp_keymap_file = output_dir + 'timestamp_keymap.txt'

# NOTE(review): the message below omits 'timestamp' even though it is
# accepted; runtime string left untouched in this documentation pass.
if key_type != 'randint' and key_type != 'timestamp' and key_type != 'email' :
    print bcolors.FAIL + 'Incorrect key_type: please pick from randint and email' + bcolors.ENDC
    sys.exit()

if distribution != 'uniform' and distribution != 'zipfian' and distribution != 'latest' :
    print bcolors.FAIL + 'Incorrect distribution: please pick from uniform, zipfian and latest' + bcolors.ENDC
    sys.exit()

# Output names: raw YCSB stdout dump, extracted YCSB keys, and the final
# transaction file for the chosen key type and distribution.
out_ycsb_txn = output_dir + 'ycsb_txn_' + key_type + '_' + distribution
out_txn_ycsbkey = output_dir + 'txn_' + 'ycsbkey' + '_' + distribution
out_txn = output_dir + 'txn_' + key_type + '_' + distribution

# Run the YCSB transaction (run) phase with the matching workload spec,
# redirecting its stdout into out_ycsb_txn.
cmd_ycsb_txn = ycsb_dir + 'ycsb run basic -P ' + workload_dir + 'workloadc_' + key_type + '_' + distribution + ' -s > ' + out_ycsb_txn

os.system(cmd_ycsb_txn)

#####################################################################################

# Pull the YCSB key out of every READ record: column 2 holds the key
# with a 4-character prefix stripped (presumably YCSB's 'user' prefix --
# verify against the dump format), writing one key per line.
f_txn = open (out_ycsb_txn, 'r')
f_txn_out = open (out_txn_ycsbkey, 'w')
for line in f_txn :
    cols = line.split()
    if len(cols) > 2 and cols[0] == 'READ' :
        f_txn_out.write (cols[2][4:] + "\n")
f_txn.close()
f_txn_out.close()

# Delete the raw YCSB dump; only the extracted keys are kept.
cmd = 'rm -f ' + out_ycsb_txn
os.system(cmd)

#####################################################################################

# Translate the extracted YCSB keys into the final key type.  For
# timestamp/email keys the keymaps written by gen_load.py are used so
# that transactions reference the same keys the load phase inserted.
# Every branch leaves f_txn and f_txn_out open; both are closed at the
# end.
if key_type == 'randint' :
    # Random integers: the YCSB keys are used verbatim.
    f_txn = open (out_txn_ycsbkey, 'r')
    f_txn_out = open (out_txn, 'w')
    for line in f_txn :
        f_txn_out.write (line)

elif key_type == 'timestamp' :
    # Load the key -> timestamp mapping produced by gen_load.py.
    timestamp_keymap = {}
    f_timestamp_keymap = open (timestamp_keymap_file, 'r')
    for line in f_timestamp_keymap :
        cols = line.split()
        timestamp_keymap[int(cols[0])] = cols[1]

    # NOTE(review): 'count' is unused in this branch.
    count = 0
    f_txn = open (out_txn_ycsbkey, 'r')
    f_txn_out = open (out_txn, 'w')
    for line in f_txn :
        cols = line.split()
        if len(cols) > 0 :
            f_txn_out.write (timestamp_keymap[int(cols[0])] + '\n')
    f_timestamp_keymap.close()

elif key_type == 'email' :
    # Load the key -> e-mail mapping produced by gen_load.py.
    email_keymap = {}
    f_email_keymap = open (email_keymap_file, 'r')
    for line in f_email_keymap :
        cols = line.split()
        email_keymap[int(cols[0])] = cols[1]

    # NOTE(review): 'count' is unused in this branch.
    count = 0
    f_txn = open (out_txn_ycsbkey, 'r')
    f_txn_out = open (out_txn, 'w')
    for line in f_txn :
        cols = line.split()
        if len(cols) > 0 :
            f_txn_out.write (email_keymap[int(cols[0])] + '\n')
    f_email_keymap.close()

f_txn.close()
f_txn_out.close()

# The intermediate YCSB-key file is no longer needed.
cmd = 'rm -f ' + out_txn_ycsbkey
os.system(cmd)


================================================
FILE: bench/workload_gen/gen_workload.sh
================================================
#!/bin/bash
# Generate the benchmark workloads used by the SuRF bench suite.
# Each gen_load.py / gen_txn.py invocation takes <key_type> <distribution>.
# Bug fix: the shebang was "#!bin/bash" (relative path), which fails when
# the script is executed directly; it must be absolute.

python gen_load.py randint uniform
python gen_txn.py randint uniform
python gen_txn.py randint zipfian
#python gen_txn.py randint latest

#python gen_load.py email uniform
#python gen_txn.py email uniform
#python gen_txn.py email zipfian
#python gen_txn.py email latest




================================================
FILE: bench/workload_gen/workload_spec/workload_template
================================================
# Copyright (c) 2012-2016 YCSB contributors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you
# may not use this file except in compliance with the License. You
# may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied. See the License for the specific language governing
# permissions and limitations under the License. See accompanying
# LICENSE file.

# Yahoo! Cloud System Benchmark
# Workload Template: Default Values
#
# File contains all properties that can be set to define a
# YCSB session. All properties are set to their default
# value if one exists. If not, the property is commented
# out. When a property has a finite number of settings,
# the default is enabled and the alternates are shown in
# comments below it.
# 
# Use of most explained through comments in Client.java or 
# CoreWorkload.java or on the YCSB wiki page:
# https://github.com/brianfrankcooper/YCSB/wiki/Core-Properties

# The name of the workload class to use
workload=com.yahoo.ycsb.workloads.CoreWorkload

# There is no default setting for recordcount but it is
# required to be set.
# The number of records in the table to be inserted in
# the load phase or the number of records already in the 
# table before the run phase.
recordcount=1000000

# There is no default setting for operationcount but it is
# required to be set.
# The number of operations to use during the run phase.
operationcount=3000000

# The number of insertions to do, if different from recordcount.
# Used with insertstart to grow an existing table.
#insertcount=

# The offset of the first insertion
insertstart=0

# The number of fields in a record
fieldcount=10

# The size of each field (in bytes)
fieldlength=100

# Should read all fields
readallfields=true

# Should write all fields on update
writeallfields=false

# The distribution used to choose the length of a field
fieldlengthdistribution=constant
#fieldlengthdistribution=uniform
#fieldlengthdistribution=zipfian

# What proportion of operations are reads
readproportion=0.95

# What proportion of operations are updates
updateproportion=0.05

# What proportion of operations are inserts
insertproportion=0

# What proportion of operations read then modify a record
readmodifywriteproportion=0

# What proportion of operations are scans
scanproportion=0

# On a single scan, the maximum number of records to access
maxscanlength=1000

# The distribution used to choose the number of records to access on a scan
scanlengthdistribution=uniform
#scanlengthdistribution=zipfian

# Should records be inserted in order or pseudo-randomly
insertorder=hashed
#insertorder=ordered

# The distribution of requests across the keyspace
requestdistribution=zipfian
#requestdistribution=uniform
#requestdistribution=latest

# Percentage of data items that constitute the hot set
hotspotdatafraction=0.2

# Percentage of operations that access the hot set
hotspotopnfraction=0.8

# Maximum execution time in seconds
#maxexecutiontime= 

# The name of the database table to run queries against
table=usertable

# The column family of fields (required by some databases)
#columnfamily=

# How the latency measurements are presented
measurementtype=histogram
#measurementtype=timeseries
#measurementtype=raw
# When measurementtype is set to raw, measurements will be output
# as RAW datapoints in the following csv format:
# "operation, timestamp of the measurement, latency in us"
#
# Raw datapoints are collected in-memory while the test is running. Each
# data point consumes about 50 bytes (including java object overhead).
# For a typical run of 1 million to 10 million operations, this should
# fit into memory most of the time. If you plan to do 100s of millions of
# operations per run, consider provisioning a machine with larger RAM when using
# the RAW measurement type, or split the run into multiple runs.
#
# Optionally, you can specify an output file to save raw datapoints.
# Otherwise, raw datapoints will be written to stdout.
# The output file will be appended to if it already exists, otherwise
# a new output file will be created.
#measurement.raw.output_file = /tmp/your_output_file_for_this_run

# JVM Reporting.
#
# Measure JVM information over time including GC counts, max and min memory
# used, max and min thread counts, max and min system load and others. This
# setting must be enabled in conjunction with the "-s" flag to run the status
# thread. Every "status.interval", the status thread will capture JVM 
# statistics and record the results. At the end of the run, max and mins will
# be recorded.
# measurement.trackjvm = false

# The range of latencies to track in the histogram (milliseconds)
histogram.buckets=1000

# Granularity for time series (in milliseconds)
timeseries.granularity=1000

# Latency reporting.
#
# YCSB records latency of failed operations separately from successful ones.
# Latency of all OK operations will be reported under their operation name,
# such as [READ], [UPDATE], etc.
#
# For failed operations:
# By default we don't track latency numbers of specific error status.
# We just report latency of all failed operation under one measurement name
# such as [READ-FAILED]. But optionally, user can configure to have either:
# 1. Record and report latency for each and every error status code by
#    setting reportLatencyForEachError to true, or
# 2. Record and report latency for a select set of error status codes by
#    providing a CSV list of Status codes via the "latencytrackederrors"
#    property.
# reportlatencyforeacherror=false
# latencytrackederrors="<comma separated strings of error codes>"

# Insertion error retry for the core workload.
#
# By default, the YCSB core workload does not retry any operations.
# However, during the load process, if any insertion fails, the entire
# load process is terminated.
# If a user desires to have more robust behavior during this phase, they can
# enable retry for insertion by setting the following property to a positive
# number.
# core_workload_insertion_retry_limit = 0
#
# the following number controls the interval between retries (in seconds):
# core_workload_insertion_retry_interval = 3

# Distributed Tracing via Apache HTrace (http://htrace.incubator.apache.org/)
#
# Defaults to blank / no tracing
# Below sends to a local file, sampling at 0.1%
#
# htrace.sampler.classes=ProbabilitySampler
# htrace.sampler.fraction=0.001
# htrace.span.receiver.classes=org.apache.htrace.core.LocalFileSpanReceiver
# htrace.local.file.span.receiver.path=/some/path/to/local/file
#
# To capture all spans, use the AlwaysSampler
#
# htrace.sampler.classes=AlwaysSampler
#
# To send spans to an HTraced receiver, use the below and ensure
# your classpath contains the htrace-htraced jar (i.e. when invoking the ycsb
# command add -cp /path/to/htrace-htraced.jar)
#
# htrace.span.receiver.classes=org.apache.htrace.impl.HTracedSpanReceiver
# htrace.htraced.receiver.address=example.com:9075
# htrace.htraced.error.log.period.ms=10000


================================================
FILE: bench/workload_gen/workload_spec/workloadc_email_latest
================================================
# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                                                                             
#                                                                                                                                                                                 
# Licensed under the Apache License, Version 2.0 (the "License"); you                                                                                                             
# may not use this file except in compliance with the License. You                                                                                                                
# may obtain a copy of the License at                                                                                                                                             
#                                                                                                                                                                                 
# http://www.apache.org/licenses/LICENSE-2.0                                                                                                                                      
#                                                                                                                                                                                 
# Unless required by applicable law or agreed to in writing, software                                                                                                             
# distributed under the License is distributed on an "AS IS" BASIS,                                                                                                               
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or                                                                                                                 
# implied. See the License for the specific language governing                                                                                                                    
# permissions and limitations under the License. See accompanying                                                                                                                 
# LICENSE file.                                                                                                                                                                   

# Yahoo! Cloud System Benchmark
# Workload C: Read only
#   Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop)
#                        
#   Read/update ratio: 100/0
#   Default data size: 1 KB records (10 fields, 100 bytes each, plus key)
#   Request distribution: latest

recordcount=25000000
operationcount=10000000
workload=com.yahoo.ycsb.workloads.CoreWorkload

readallfields=true

readproportion=1
updateproportion=0
scanproportion=0
insertproportion=0

requestdistribution=latest





================================================
FILE: bench/workload_gen/workload_spec/workloadc_email_uniform
================================================
# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                                                                             
#                                                                                                                                                                                 
# Licensed under the Apache License, Version 2.0 (the "License"); you                                                                                                             
# may not use this file except in compliance with the License. You                                                                                                                
# may obtain a copy of the License at                                                                                                                                             
#                                                                                                                                                                                 
# http://www.apache.org/licenses/LICENSE-2.0                                                                                                                                      
#                                                                                                                                                                                 
# Unless required by applicable law or agreed to in writing, software                                                                                                             
# distributed under the License is distributed on an "AS IS" BASIS,                                                                                                               
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or                                                                                                                 
# implied. See the License for the specific language governing                                                                                                                    
# permissions and limitations under the License. See accompanying                                                                                                                 
# LICENSE file.                                                                                                                                                                   

# Yahoo! Cloud System Benchmark
# Workload C: Read only
#   Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop)
#                        
#   Read/update ratio: 100/0
#   Data size: small records (1 field, 10 bytes, plus key)
#   Request distribution: uniform

recordcount=25000000
operationcount=10000000
workload=com.yahoo.ycsb.workloads.CoreWorkload

fieldcount=1
fieldlength=10
readallfields=true

readproportion=1
updateproportion=0
scanproportion=0
insertproportion=0

requestdistribution=uniform





================================================
FILE: bench/workload_gen/workload_spec/workloadc_email_zipfian
================================================
# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                                                                             
#                                                                                                                                                                                 
# Licensed under the Apache License, Version 2.0 (the "License"); you                                                                                                             
# may not use this file except in compliance with the License. You                                                                                                                
# may obtain a copy of the License at                                                                                                                                             
#                                                                                                                                                                                 
# http://www.apache.org/licenses/LICENSE-2.0                                                                                                                                      
#                                                                                                                                                                                 
# Unless required by applicable law or agreed to in writing, software                                                                                                             
# distributed under the License is distributed on an "AS IS" BASIS,                                                                                                               
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or                                                                                                                 
# implied. See the License for the specific language governing                                                                                                                    
# permissions and limitations under the License. See accompanying                                                                                                                 
# LICENSE file.                                                                                                                                                                   

# Yahoo! Cloud System Benchmark
# Workload C: Read only
#   Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop)
#                        
#   Read/update ratio: 100/0
#   Default data size: 1 KB records (10 fields, 100 bytes each, plus key)
#   Request distribution: zipfian

recordcount=25000000
operationcount=10000000
workload=com.yahoo.ycsb.workloads.CoreWorkload

readallfields=true

readproportion=1
updateproportion=0
scanproportion=0
insertproportion=0

requestdistribution=zipfian





================================================
FILE: bench/workload_gen/workload_spec/workloadc_randint_latest
================================================
# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                                                                             
#                                                                                                                                                                                 
# Licensed under the Apache License, Version 2.0 (the "License"); you                                                                                                             
# may not use this file except in compliance with the License. You                                                                                                                
# may obtain a copy of the License at                                                                                                                                             
#                                                                                                                                                                                 
# http://www.apache.org/licenses/LICENSE-2.0                                                                                                                                      
#                                                                                                                                                                                 
# Unless required by applicable law or agreed to in writing, software                                                                                                             
# distributed under the License is distributed on an "AS IS" BASIS,                                                                                                               
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or                                                                                                                 
# implied. See the License for the specific language governing                                                                                                                    
# permissions and limitations under the License. See accompanying                                                                                                                 
# LICENSE file.                                                                                                                                                                   

# Yahoo! Cloud System Benchmark
# Workload C: Read only
#   Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop)
#                        
#   Read/update ratio: 100/0
#   Default data size: 1 KB records (10 fields, 100 bytes each, plus key)
#   Request distribution: latest

recordcount=100000000
operationcount=10000000
workload=com.yahoo.ycsb.workloads.CoreWorkload

readallfields=true

readproportion=1
updateproportion=0
scanproportion=0
insertproportion=0

requestdistribution=latest





================================================
FILE: bench/workload_gen/workload_spec/workloadc_randint_uniform
================================================
# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                                                                             
#                                                                                                                                                                                 
# Licensed under the Apache License, Version 2.0 (the "License"); you                                                                                                             
# may not use this file except in compliance with the License. You                                                                                                                
# may obtain a copy of the License at                                                                                                                                             
#                                                                                                                                                                                 
# http://www.apache.org/licenses/LICENSE-2.0                                                                                                                                      
#                                                                                                                                                                                 
# Unless required by applicable law or agreed to in writing, software                                                                                                             
# distributed under the License is distributed on an "AS IS" BASIS,                                                                                                               
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or                                                                                                                 
# implied. See the License for the specific language governing                                                                                                                    
# permissions and limitations under the License. See accompanying                                                                                                                 
# LICENSE file.                                                                                                                                                                   

# Yahoo! Cloud System Benchmark
# Workload C: Read only
#   Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop)
#                        
#   Read/update ratio: 100/0
#   Data size: small records (1 field, 10 bytes, plus key)
#   Request distribution: uniform

recordcount=100000000
operationcount=10000000
workload=com.yahoo.ycsb.workloads.CoreWorkload

fieldcount=1
fieldlength=10
readallfields=true

readproportion=1
updateproportion=0
scanproportion=0
insertproportion=0

requestdistribution=uniform





================================================
FILE: bench/workload_gen/workload_spec/workloadc_randint_zipfian
================================================
# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                                                                             
#                                                                                                                                                                                 
# Licensed under the Apache License, Version 2.0 (the "License"); you                                                                                                             
# may not use this file except in compliance with the License. You                                                                                                                
# may obtain a copy of the License at                                                                                                                                             
#                                                                                                                                                                                 
# http://www.apache.org/licenses/LICENSE-2.0                                                                                                                                      
#                                                                                                                                                                                 
# Unless required by applicable law or agreed to in writing, software                                                                                                             
# distributed under the License is distributed on an "AS IS" BASIS,                                                                                                               
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or                                                                                                                 
# implied. See the License for the specific language governing                                                                                                                    
# permissions and limitations under the License. See accompanying                                                                                                                 
# LICENSE file.                                                                                                                                                                   

# Yahoo! Cloud System Benchmark
# Workload C: Read only
#   Application example: user profile cache, where profiles are constructed elsewhere (e.g., Hadoop)
#                        
#   Read/update ratio: 100/0
#   Default data size: 1 KB records (10 fields, 100 bytes each, plus key)
#   Request distribution: zipfian

recordcount=100000000
operationcount=10000000
workload=com.yahoo.ycsb.workloads.CoreWorkload

readallfields=true

readproportion=1
updateproportion=0
scanproportion=0
insertproportion=0

requestdistribution=zipfian





================================================
FILE: bench/workload_gen/ycsb_download.sh
================================================
#!/bin/bash
# Download YCSB 0.12.0 and unpack it into ./YCSB; also create the
# ../workloads output directory used by the workload generators.
# Fixes: added a shebang and "set -e" so a failed download no longer
# lets the later tar/rm/mv steps run on a missing or truncated archive;
# "mkdir -p" keeps reruns working (plain mkdir would abort under set -e
# if the directory already exists).
set -e

mkdir -p ../workloads
curl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.12.0/ycsb-0.12.0.tar.gz
tar xfvz ycsb-0.12.0.tar.gz
rm ycsb-0.12.0.tar.gz
mv ycsb-0.12.0 YCSB


================================================
FILE: bench/workload_multi_thread.cpp
================================================
#include "bench.hpp"
#include "filter_factory.hpp"

//#define VERBOSE 1

// Transaction (query) keys, shared read-only by all worker threads
// after setup completes in main().
static std::vector<std::string> txn_keys;
// Range-query upper bounds; upper_bound_keys[i] pairs with txn_keys[i].
// Only populated when the query type is "range".
static std::vector<std::string> upper_bound_keys;

// Per-thread workload slice plus result slots filled by the worker.
typedef struct ThreadArg {
    int thread_id;           // worker index, used for reporting
    bench::Filter* filter;   // shared filter under test (lookups only)
    int start_pos;           // first index (inclusive) into txn_keys
    int end_pos;             // one past the last index into txn_keys
    int query_type;          // 0 = point lookup, 1 = range lookup
    int64_t out_positives;   // out: number of positive filter answers
    double tput;             // out: this thread's throughput (Mops/sec)
} ThreadArg;

void* execute_workload(void* arg) {
    ThreadArg* thread_arg = (ThreadArg*)arg;
    int64_t positives = 0;
    double start_time = bench::getNow();
    if (thread_arg->query_type == 0) { // point
	for (int i = thread_arg->start_pos; i < thread_arg->end_pos; i++)
	    positives += (int)thread_arg->filter->lookup(txn_keys[i]);
    } else { // range
	for (int i = thread_arg->start_pos; i < thread_arg->end_pos; i++)
	    positives += (int)thread_arg->filter->lookupRange(txn_keys[i], 
							      upper_bound_keys[i]);
    }
    double end_time = bench::getNow();
    double tput = (thread_arg->end_pos - thread_arg->start_pos) / (end_time - start_time) / 1000000; // Mops/sec

#ifdef VERBOSE
    std::cout << "Thread #" << thread_arg->thread_id << bench::kGreen 
    	      << ": Throughput = " << bench::kNoColor << tput << "\n";
#else
    std::cout << tput << "\n";
#endif

    thread_arg->out_positives = positives;
    thread_arg->tput = tput;
    pthread_exit(NULL);
    return NULL;
}

// Multi-threaded filter benchmark driver.
//
// Builds the requested filter over a sampled percentage of the load
// keys, partitions the transaction keys across worker threads issuing
// point or range lookups, and prints per-thread plus aggregate
// throughput. With VERBOSE defined it also computes the false-positive
// rate against an exact std::map oracle.
//
// Fixes: "alterByte" was rejected by an inverted validation check;
// unqualified `endl` did not compile; the last thread now also covers
// the remainder keys; num_threads is validated before the division.
int main(int argc, char *argv[]) {
    if (argc != 10) {
	std::cout << "Usage:\n";
	std::cout << "1. filter type: SuRF, SuRFHash, SuRFReal, Bloom\n";
	std::cout << "2. suffix length: 0 < len <= 64 (for SuRFHash and SuRFReal only)\n";
	std::cout << "3. workload type: mixed, alterByte (only for email key)\n";
	std::cout << "4. percentage of keys inserted: 0 < num <= 100\n";
	std::cout << "5. byte position (counting from last, only for alterByte): num\n";
	std::cout << "6. key type: randint, email\n";
	std::cout << "7. query type: point, range\n";
	std::cout << "8. distribution: uniform, zipfian, latest\n";
	std::cout << "9. number of threads\n";
	return -1;
    }

    std::string filter_type = argv[1];
    uint32_t suffix_len = (uint32_t)atoi(argv[2]);
    std::string workload_type = argv[3];
    unsigned percent = atoi(argv[4]);
    unsigned byte_pos = atoi(argv[5]);
    std::string key_type = argv[6];
    std::string query_type = argv[7];
    std::string distribution = argv[8];
    int num_threads = atoi(argv[9]);

    // check args ====================================================
    if (filter_type.compare(std::string("SuRF")) != 0
	&& filter_type.compare(std::string("SuRFHash")) != 0
	&& filter_type.compare(std::string("SuRFReal")) != 0
	&& filter_type.compare(std::string("Bloom")) != 0
	&& filter_type.compare(std::string("ARF")) != 0) {
	std::cout << bench::kRed << "WRONG filter type\n" << bench::kNoColor;
	return -1;
    }

    if (suffix_len == 0 || suffix_len > 64) {
	std::cout << bench::kRed << "WRONG suffix length\n" << bench::kNoColor;
	return -1;
    }

    // BUG FIX: the second clause used to be `== 0`, which rejected the
    // valid "alterByte" value and let any other unknown type through.
    if (workload_type.compare(std::string("mixed")) != 0
	&& workload_type.compare(std::string("alterByte")) != 0) {
	std::cout << bench::kRed << "WRONG workload type\n" << bench::kNoColor;
	return -1;
    }

    if (percent > 100) {
	std::cout << bench::kRed << "WRONG percentage\n" << bench::kNoColor;
	return -1;
    }

    if (key_type.compare(std::string("randint")) != 0
	&& key_type.compare(std::string("timestamp")) != 0
	&& key_type.compare(std::string("email")) != 0) {
	std::cout << bench::kRed << "WRONG key type\n" << bench::kNoColor;
	return -1;
    }

    if (query_type.compare(std::string("point")) != 0
	&& query_type.compare(std::string("range")) != 0) {
	std::cout << bench::kRed << "WRONG query type\n" << bench::kNoColor;
	return -1;
    }

    if (distribution.compare(std::string("uniform")) != 0
	&& distribution.compare(std::string("zipfian")) != 0
	&& distribution.compare(std::string("latest")) != 0) {
	std::cout << bench::kRed << "WRONG distribution\n" << bench::kNoColor;
	return -1;
    }

    // Guards the num_txns / num_threads division below.
    if (num_threads < 1) {
	std::cout << bench::kRed << "WRONG number of threads\n" << bench::kNoColor;
	return -1;
    }

    // load keys from files =======================================
    // Email keys are loaded verbatim; other key types use the numeric
    // path (second argument) of bench::loadKeysFromFile.
    std::string load_file = "workloads/load_";
    load_file += key_type;
    std::vector<std::string> load_keys;
    if (key_type.compare(std::string("email")) == 0)
	bench::loadKeysFromFile(load_file, false, load_keys);
    else
	bench::loadKeysFromFile(load_file, true, load_keys);

    std::string txn_file = "workloads/txn_";
    txn_file += key_type;
    txn_file += "_";
    txn_file += distribution;

    if (key_type.compare(std::string("email")) == 0)
	bench::loadKeysFromFile(txn_file, false, txn_keys);
    else
	bench::loadKeysFromFile(txn_file, true, txn_keys);

    std::vector<std::string> insert_keys;
    bench::selectKeysToInsert(percent, insert_keys, load_keys);

    if (workload_type.compare(std::string("alterByte")) == 0)
	bench::modifyKeyByte(txn_keys, byte_pos);

    // compute upperbound keys for range queries =================
    if (query_type.compare(std::string("range")) == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++)
	    upper_bound_keys.push_back(bench::getUpperBoundKey(key_type, txn_keys[i]));
    }

    // create filter ==============================================
    bench::Filter* filter = bench::FilterFactory::createFilter(filter_type, suffix_len, insert_keys);

#ifdef VERBOSE
    std::cout << bench::kGreen << "Memory = " << bench::kNoColor << filter->getMemoryUsage() << std::endl;
#endif

    // execute transactions =======================================
    pthread_t* threads = new pthread_t[num_threads];
    pthread_attr_t attr;
    // Initialize and set thread joinable
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

    ThreadArg* thread_args = new ThreadArg[num_threads];
    int num_txns = (int)txn_keys.size();
    int num_txns_per_thread = num_txns / num_threads;
    for (int i = 0; i < num_threads; i++) {
	thread_args[i].thread_id = i;
	thread_args[i].filter = filter;
	thread_args[i].start_pos = num_txns_per_thread * i;
	// The last thread also takes the remainder so every transaction
	// key is processed when num_txns % num_threads != 0.
	if (i == num_threads - 1)
	    thread_args[i].end_pos = num_txns;
	else
	    thread_args[i].end_pos = num_txns_per_thread * (i + 1);
	if (query_type.compare(std::string("point")) == 0)
	    thread_args[i].query_type = 0;
	else
	    thread_args[i].query_type = 1;
	thread_args[i].out_positives = 0;
	thread_args[i].tput = 0;
    }

    for (int i = 0; i < num_threads; i++) {
	int rc = pthread_create(&threads[i], NULL, execute_workload, (void*)(&thread_args[i]));
	if (rc) {
	    std::cout << "Error: unable to create thread " << rc << std::endl;
	    exit(-1);
	}
    }

    // free attribute and wait for the other threads
    pthread_attr_destroy(&attr);
    for (int i = 0; i < num_threads; i++) {
	void* status;
	int rc = pthread_join(threads[i], &status);
	if (rc) {
	    // BUG FIX: `endl` was unqualified (missing std::).
	    std::cout << "Error:unable to join " << rc << std::endl;
	    exit(-1);
	}
    }

    double tput = 0;
    for (int i = 0; i < num_threads; i++) {
	tput += thread_args[i].tput;
    }

#ifdef VERBOSE
    std::cout << bench::kGreen << "Throughput = " << bench::kNoColor << tput << "\n";

    // int64_t to match ThreadArg::out_positives and avoid overflow.
    int64_t positives = 0;
    for (int i = 0; i < num_threads; i++) {
	positives += (thread_args[i].out_positives);
    }

    // compute true positives ======================================
    std::map<std::string, bool> ht;
    for (int i = 0; i < (int)insert_keys.size(); i++)
	ht[insert_keys[i]] = true;

    int64_t true_positives = 0;
    std::map<std::string, bool>::iterator ht_iter;
    if (query_type.compare(std::string("point")) == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++) {
	    ht_iter = ht.find(txn_keys[i]);
	    true_positives += (ht_iter != ht.end());
	}
    } else if (query_type.compare(std::string("range")) == 0) {
	for (int i = 0; i < (int)txn_keys.size(); i++) {
	    ht_iter = ht.upper_bound(txn_keys[i]);
	    if (ht_iter != ht.end()) {
		std::string fetched_key = ht_iter->first;
		true_positives += (fetched_key.compare(upper_bound_keys[i]) < 0);
	    }
	}
    }
    int64_t false_positives = positives - true_positives;
    assert(false_positives >= 0);
    int64_t true_negatives = txn_keys.size() - true_positives;
    double fp_rate = 0;
    if (false_positives > 0)
	fp_rate = false_positives / (true_negatives + false_positives + 0.0);

    std::cout << "positives = " << positives << "\n";
    std::cout << "true positives = " << true_positives << "\n";
    std::cout << "false positives = " << false_positives << "\n";
    std::cout << "true negatives = " << true_negatives << "\n";
    std::cout << bench::kGreen << "False Positive Rate = " << bench::kNoColor << fp_rate << "\n";
#else
    std::cout << tput << "\n";
    std::cout << bench::kGreen << bench::kNoColor << "\n\n";
#endif

    delete[] threads;
    delete[] thread_args;

    pthread_exit(NULL);
    return 0;
}


================================================
FILE: include/bitvector.hpp
================================================
#ifndef BITVECTOR_H_
#define BITVECTOR_H_

#include <assert.h>

#include <vector>

#include "config.hpp"

namespace surf {

// Plain bitvector: per-level bit sequences concatenated into a single
// array of 64-bit words. Bits are stored MSB-first within each word
// (see readBit). Base class for the rank/select-augmented variants.
class Bitvector {
public:
    Bitvector() : num_bits_(0), bits_(nullptr) {};

    // Build by concatenating the bitvectors of levels
    // [start_level, end_level); end_level == 0 means "all levels".
    Bitvector(const std::vector<std::vector<word_t> >& bitvector_per_level, 
	      const std::vector<position_t>& num_bits_per_level, 
	      const level_t start_level = 0, 
	      level_t end_level = 0/* non-inclusive */) {
	if (end_level == 0)
	    end_level = bitvector_per_level.size();
	num_bits_ = totalNumBits(num_bits_per_level, start_level, end_level);
	bits_ = new word_t[numWords()];
	memset(bits_, 0, bitsSize());
	concatenateBitvectors(bitvector_per_level, num_bits_per_level, start_level, end_level);
    }

    // NOTE(review): bits_ is intentionally not freed here; lifetime seems
    // to be managed via an explicit destroy() on the owning structure
    // (as done elsewhere in this library) — confirm before changing.
    ~Bitvector() {}

    position_t numBits() const {
	return num_bits_;
    }

    // Number of 64-bit words needed to hold num_bits_ (rounded up).
    position_t numWords() const {
	if (num_bits_ % kWordSize == 0)
	    return (num_bits_ / kWordSize);
	else
	    return (num_bits_ / kWordSize + 1);
    }

    // in bytes
    position_t bitsSize() const {
	return (numWords() * (kWordSize / 8));
    }

    // in bytes
    position_t size() const {
	return (sizeof(Bitvector) + bitsSize());
    }

    bool readBit(const position_t pos) const;

    // Distance (in bits) from pos to the nearest set bit after/before it.
    position_t distanceToNextSetBit(const position_t pos) const;
    position_t distanceToPrevSetBit(const position_t pos) const;

private:
    // Sum of num_bits_per_level over [start_level, end_level).
    position_t totalNumBits(const std::vector<position_t>& num_bits_per_level, 
			    const level_t start_level, 
			    const level_t end_level/* non-inclusive */);

    // Bit-append the selected levels' words into bits_.
    void concatenateBitvectors(const std::vector<std::vector<word_t> >& bitvector_per_level, 
			       const std::vector<position_t>& num_bits_per_level, 
			       const level_t start_level, 
			       const level_t end_level/* non-inclusive */);
protected:
    position_t num_bits_; // logical length in bits
    word_t* bits_;        // packed words, zero-padded to a word boundary
};

bool Bitvector::readBit (const position_t pos) const {
    assert(pos <= num_bits_);
    position_t word_id = pos / kWordSize;
    position_t offset = pos & (kWordSize - 1);
    return bits_[word_id] & (kMsbMask >> offset);
}

// Distance (in bits) from pos to the next set bit at a strictly greater
// position. If the remainder of the last word holds no set bit, returns
// num_bits_ - pos (i.e., the distance to the end of the bitvector).
position_t Bitvector::distanceToNextSetBit (const position_t pos) const {
    assert(pos < num_bits_);
    position_t distance = 1;

    position_t word_id = (pos + 1) / kWordSize;
    position_t offset = (pos + 1) % kWordSize;

    //first word left-over bits
    word_t test_bits = bits_[word_id] << offset;
    if (test_bits > 0) {
	// clz counts leading zeros = bits until the first set bit.
	return (distance + __builtin_clzll(test_bits));
    } else {
	// Nothing left in this word; if it was the last word, report the
	// distance to the end of the bitvector.
	if (word_id == numWords() - 1)
	    return (num_bits_ - pos);
	distance += (kWordSize - offset);
    }

    // Scan subsequent whole words until a set bit is found.
    while (word_id < numWords() - 1) {
	word_id++;
	test_bits = bits_[word_id];
	if (test_bits > 0)
	    return (distance + __builtin_clzll(test_bits));
	distance += kWordSize;
    }
    return distance;
}

// Distance (in bits) from pos back to the previous set bit at a strictly
// smaller position. Returns 0 for pos == 0; if no earlier set bit
// exists, returns the accumulated scan distance.
position_t Bitvector::distanceToPrevSetBit (const position_t pos) const {
    assert(pos <= num_bits_);
    if (pos == 0) return 0;
    position_t distance = 1;

    position_t word_id = (pos - 1) / kWordSize;
    position_t offset = (pos - 1) % kWordSize;

    //first word left-over bits
    // Shift right so the bit at (pos - 1) lands in the LSB; ctz then
    // counts bits back to the nearest set bit.
    word_t test_bits = bits_[word_id] >> (kWordSize - 1 - offset);
    if (test_bits > 0) {
	return (distance + __builtin_ctzll(test_bits));
    } else {
	//if (word_id == 0)
	//return (offset + 1);
	distance += (offset + 1);
    }

    // Scan earlier whole words until a set bit is found.
    while (word_id > 0) {
	word_id--;
	test_bits = bits_[word_id];
	if (test_bits > 0)
	    return (distance + __builtin_ctzll(test_bits));
	distance += kWordSize;
    }
    return distance;
}

// Sum of the per-level bit counts over [start_level, end_level).
position_t Bitvector::totalNumBits(const std::vector<position_t>& num_bits_per_level, 
			     const level_t start_level, 
			     const level_t end_level/* non-inclusive */) {
    position_t total = 0;
    level_t level = start_level;
    while (level < end_level) {
	total += num_bits_per_level[level];
	level++;
    }
    return total;
}

// Append each selected level's bit sequence into bits_, packing across
// word boundaries. bit_shift tracks how many bits of the current output
// word are already occupied; each input word is split between the
// current output word and (when shifted) the next one.
void Bitvector::concatenateBitvectors(const std::vector<std::vector<word_t> >& bitvector_per_level, 
				      const std::vector<position_t>& num_bits_per_level, 
				      const level_t start_level, 
				      const level_t end_level/* non-inclusive */) {
    position_t bit_shift = 0;
    position_t word_id = 0;
    for (level_t level = start_level; level < end_level; level++) {
	if (num_bits_per_level[level] == 0) continue;
	position_t num_complete_words = num_bits_per_level[level] / kWordSize;
	for (position_t word = 0; word < num_complete_words; word++) {
	    // High part goes into the current output word, low part
	    // carries into the next one when the output is misaligned.
	    bits_[word_id] |= (bitvector_per_level[level][word] >> bit_shift);
	    word_id++;
	    if (bit_shift > 0)
		bits_[word_id] |= (bitvector_per_level[level][word] << (kWordSize - bit_shift));
	}

	// Leftover bits (< kWordSize) of this level.
	word_t bits_remain = num_bits_per_level[level] - num_complete_words * kWordSize;
	if (bits_remain > 0) {
	    word_t last_word = bitvector_per_level[level][num_complete_words];
	    bits_[word_id] |= (last_word >> bit_shift);
	    if (bit_shift + bits_remain < kWordSize) {
		bit_shift += bits_remain;
	    } else {
		// The leftover bits straddle an output word boundary.
		word_id++;
		bits_[word_id] |= (last_word << (kWordSize - bit_shift));
		bit_shift = bit_shift + bits_remain - kWordSize;
	    }
	}
    }
}

} // namespace surf

#endif // BITVECTOR_H_


================================================
FILE: include/config.hpp
================================================
#ifndef CONFIG_H_
#define CONFIG_H_

#include <stdint.h>
#include <string.h>

#include <string>

namespace surf {

// Core type aliases and tuning constants shared by all SuRF components.
using level_t = uint32_t;
using position_t = uint32_t;
static const position_t kMaxPos = UINT32_MAX;

using label_t = uint8_t;
static const position_t kFanout = 256; // one slot per possible byte label

using word_t = uint64_t;
static const unsigned kWordSize = 64;
static const word_t kMsbMask = 0x8000000000000000;
static const word_t kOneMask = 0xFFFFFFFFFFFFFFFF;

static const bool kIncludeDense = true;
//static const uint32_t kSparseDenseRatio = 64;
static const uint32_t kSparseDenseRatio = 16;
static const label_t kTerminator = 255;

static const int kHashShift = 7;

static const int kCouldBePositive = 2018; // used in suffix comparison

enum SuffixType {
    kNone = 0,
    kHash = 1,
    kReal = 2,
    kMixed = 3
};

// NOTE: these helpers live in a header, so they must be `inline` —
// without it, including config.hpp from multiple translation units
// produces multiple-definition link errors (ODR violation).

// Round ptr up to the next 8-byte boundary.
inline void align(char*& ptr) {
    ptr = (char*)(((uint64_t)ptr + 7) & ~((uint64_t)7));
}

// Round size up to the next multiple of 8.
inline void sizeAlign(position_t& size) {
    size = (size + 7) & ~((position_t)7);
}

// Round size up to the next multiple of 8 (64-bit overload).
inline void sizeAlign(uint64_t& size) {
    size = (size + 7) & ~((uint64_t)7);
}

// Encode a uint64 as an 8-byte big-endian string so that
// lexicographic order of the strings matches numeric order.
inline std::string uint64ToString(const uint64_t word) {
    uint64_t endian_swapped_word = __builtin_bswap64(word);
    return std::string(reinterpret_cast<const char*>(&endian_swapped_word), 8);
}

// Inverse of uint64ToString; str_word must hold at least 8 bytes.
inline uint64_t stringToUint64(const std::string& str_word) {
    uint64_t int_word = 0;
    memcpy(reinterpret_cast<char*>(&int_word), str_word.data(), 8);
    return __builtin_bswap64(int_word);
}

} // namespace surf

#endif // CONFIG_H_


================================================
FILE: include/hash.hpp
================================================
#ifndef HASH_H_
#define HASH_H_

#include <string>

namespace surf {

//******************************************************
//HASH FUNCTION FROM LEVELDB
//******************************************************
// Unaligned native-endian 32-bit load. memcpy sidesteps strict-aliasing
// and alignment issues; compilers lower it to a single plain load.
inline uint32_t DecodeFixed32(const char* ptr) {
    uint32_t value;
    memcpy(&value, ptr, sizeof(uint32_t));
    return value;
}

// LevelDB's Murmur-like string hash: mixes 4 bytes at a time, then
// folds in the 1-3 trailing bytes. Returns a 32-bit hash of data[0..n).
inline uint32_t Hash(const char* data, size_t n, uint32_t seed) {
    // Similar to murmur hash
    const uint32_t m = 0xc6a4a793;
    const uint32_t r = 24;
    const char* limit = data + n;
    uint32_t h = seed ^ (n * m);

    // Pick up four bytes at a time
    while (data + 4 <= limit) {
	uint32_t w = DecodeFixed32(data);
	data += 4;
	h += w;
	h *= m;
	h ^= (h >> 16);
    }

    // Pick up remaining bytes.
    // NOTE: the case fall-throughs are intentional (LevelDB style):
    // each case adds one trailing byte; the final mix runs in case 1.
    switch (limit - data) {
    case 3:
	h += static_cast<unsigned char>(data[2]) << 16;
	// fall through
    case 2:
	h += static_cast<unsigned char>(data[1]) << 8;
	// fall through
    case 1:
	h += static_cast<unsigned char>(data[0]);
	h *= m;
	h ^= (h >> r);
	break;
    }
    return h;
}

// Hash a key (std::string form) with the fixed suffix-hash seed.
inline uint32_t suffixHash(const std::string &key) {
    return Hash(key.data(), key.size(), 0xbc9f1d34);
}

// Hash a key given as a raw buffer and length; same seed as above.
inline uint32_t suffixHash(const char* key, const int keylen) {
    return Hash(key, static_cast<size_t>(keylen), 0xbc9f1d34);
}

} // namespace surf

#endif // HASH_H_



================================================
FILE: include/label_vector.hpp
================================================
#ifndef LABELVECTOR_H_
#define LABELVECTOR_H_

#include <emmintrin.h>

#include <vector>

#include "config.hpp"

namespace surf {

// Flat byte array of all LOUDS-Sparse branch labels, concatenated level
// by level. Search helpers locate a label (or the smallest label greater
// than a target) within one node's label range.
class LabelVector {
public:
    LabelVector() : num_bytes_(0), labels_(nullptr) {};

    // Concatenate the per-level label sequences of
    // [start_level, end_level); end_level == 0 means "all levels".
    LabelVector(const std::vector<std::vector<label_t> >& labels_per_level,
		const level_t start_level = 0,
		level_t end_level = 0/* non-inclusive */) {
	if (end_level == 0)
	    end_level = labels_per_level.size();

	num_bytes_ = 1;
	for (level_t level = start_level; level < end_level; level++)
	    num_bytes_ += labels_per_level[level].size();

	// BUG FIX: simdSearch() always loads 16-byte chunks and may read
	// up to 15 bytes past the last valid label. The previous formula
	// (num_bytes_ * (num_bytes_ / kWordSize + 1)) gave NO padding for
	// num_bytes_ < 64 (heap overread) and quadratic over-allocation
	// for large vectors. A fixed 16-byte zeroed pad is sufficient.
	position_t alloc_bytes = num_bytes_ + 16;
	labels_ = new label_t[alloc_bytes];
	memset(labels_, 0, alloc_bytes);

	position_t pos = 0;
	for (level_t level = start_level; level < end_level; level++) {
	    for (position_t idx = 0; idx < labels_per_level[level].size(); idx++) {
		labels_[pos] = labels_per_level[level][idx];
		pos++;
	    }
	}
    }

    // labels_ is freed via destroy(), not here (shared-ownership pattern
    // used throughout this library).
    ~LabelVector() {}

    position_t getNumBytes() const {
	return num_bytes_;
    }

    // Serialized footprint in bytes (8-byte aligned).
    position_t serializedSize() const {
	position_t size = sizeof(num_bytes_) + num_bytes_;
	sizeAlign(size);
	return size;
    }

    // In-memory footprint in bytes (excludes the SIMD padding).
    position_t size() const {
	return (sizeof(LabelVector) + num_bytes_);
    }

    label_t read(const position_t pos) const {
	return labels_[pos];
    }

    label_t operator[](const position_t pos) const {
	return labels_[pos];
    }

    // Find target in [pos, pos + search_len); on success pos is set to
    // the match. The *GreaterThan variants find the smallest label
    // strictly greater than target.
    bool search(const label_t target, position_t& pos, const position_t search_len) const;
    bool searchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const;

    bool binarySearch(const label_t target, position_t& pos, const position_t search_len) const;
    bool simdSearch(const label_t target, position_t& pos, const position_t search_len) const;
    bool linearSearch(const label_t target, position_t& pos, const position_t search_len) const;

    bool binarySearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const;
    bool linearSearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const;

    // Write num_bytes_ followed by the labels; dst advances, 8-byte aligned.
    void serialize(char*& dst) const {
	memcpy(dst, &num_bytes_, sizeof(num_bytes_));
	dst += sizeof(num_bytes_);
	memcpy(dst, labels_, num_bytes_);
	dst += num_bytes_;
	align(dst);
    }
    
    static LabelVector* deSerialize(char*& src) {
	LabelVector* lv = new LabelVector();
	memcpy(&(lv->num_bytes_), src, sizeof(lv->num_bytes_));
	src += sizeof(lv->num_bytes_);
	
	// Same 16-byte zeroed pad as the constructor so simdSearch()'s
	// 16-byte loads stay within the allocation.
	lv->labels_ = new label_t[lv->num_bytes_ + 16];
	memcpy(lv->labels_, src, lv->num_bytes_);
	memset(lv->labels_ + lv->num_bytes_, 0, 16);
	src += lv->num_bytes_;
	
	align(src);
	return lv;
    }

    void destroy() {
	delete[] labels_;	
    }

private:
    position_t num_bytes_; // logical number of labels (incl. leading slot)
    label_t* labels_;      // label bytes + 16-byte zeroed SIMD pad
};

// Locate target within [pos, pos + search_len), picking the strategy by
// range length: linear (< 3), binary (< 12), otherwise SIMD.
bool LabelVector::search(const label_t target, position_t& pos, position_t search_len) const {
    // A leading terminator label is not a real branch label; skip it.
    if ((search_len > 1) && (labels_[pos] == kTerminator)) {
	pos++;
	search_len--;
    }

    if (search_len < 3)
	return linearSearch(target, pos, search_len);
    else if (search_len < 12)
	return binarySearch(target, pos, search_len);
    else
	return simdSearch(target, pos, search_len);
}

// Find the smallest label strictly greater than target within
// [pos, pos + search_len); linear scan for tiny ranges, binary otherwise.
bool LabelVector::searchGreaterThan(const label_t target, position_t& pos, position_t search_len) const {
    // A leading terminator label is not a real branch label; skip it.
    if ((search_len > 1) && (labels_[pos] == kTerminator)) {
	pos++;
	search_len--;
    }

    return (search_len < 3)
	? linearSearchGreaterThan(target, pos, search_len)
	: binarySearchGreaterThan(target, pos, search_len);
}

bool LabelVector::binarySearch(const label_t target, position_t& pos, const position_t search_len) const {
    position_t l = pos;
    position_t r = pos + search_len;
    while (l < r) {
	position_t m = (l + r) >> 1;
	if (target < labels_[m]) {
	    r = m;
	} else if (target == labels_[m]) {
	    pos = m;
	    return true;
	} else {
	    l = m + 1;
	}
    }
    return false;
}

// SSE2 search for target in [pos, pos + search_len): compares 16 labels
// per iteration; on a hit, pos is advanced to the first matching index.
// The final partial chunk still issues a full 16-byte load and masks off
// lanes beyond search_len, so the array is expected to be padded past
// its logical end (see the constructor's allocation).
bool LabelVector::simdSearch(const label_t target, position_t& pos, const position_t search_len) const {
    position_t num_labels_searched = 0;
    position_t num_labels_left = search_len;
    // Full 16-label chunks.
    while ((num_labels_left >> 4) > 0) {
	label_t* start_ptr = labels_ + pos + num_labels_searched;
	__m128i cmp = _mm_cmpeq_epi8(_mm_set1_epi8(target), 
				     _mm_loadu_si128(reinterpret_cast<__m128i*>(start_ptr)));
	unsigned check_bits = _mm_movemask_epi8(cmp);
	if (check_bits) {
	    // Lowest set bit of the movemask = first matching lane.
	    pos += (num_labels_searched + __builtin_ctz(check_bits));
	    return true;
	}
	num_labels_searched += 16;
	num_labels_left -= 16;
    }

    // Trailing partial chunk: mask keeps only the valid lanes.
    if (num_labels_left > 0) {
	label_t* start_ptr = labels_ + pos + num_labels_searched;
	__m128i cmp = _mm_cmpeq_epi8(_mm_set1_epi8(target), 
				     _mm_loadu_si128(reinterpret_cast<__m128i*>(start_ptr)));
	unsigned leftover_bits_mask = (1 << num_labels_left) - 1;
	unsigned check_bits = _mm_movemask_epi8(cmp) & leftover_bits_mask;
	if (check_bits) {
	    pos += (num_labels_searched + __builtin_ctz(check_bits));
	    return true;
	}
    }

    return false;
}

bool LabelVector::linearSearch(const label_t target, position_t&  pos, const position_t search_len) const {
    for (position_t i = 0; i < search_len; i++) {
	if (target == labels_[pos + i]) {
	    pos += i;
	    return true;
	}
    }
    return false;
}

// Binary search for the smallest label strictly greater than target in
// [pos, pos + search_len). Returns true and sets pos on success; false
// when every label in the range is <= target.
bool LabelVector::binarySearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const {
    position_t l = pos;
    position_t r = pos + search_len;
    while (l < r) {
	position_t m = (l + r) >> 1;
	if (target < labels_[m]) {
	    r = m;
	} else if (target == labels_[m]) {
	    // Exact hit: the answer is the next slot, if still in range.
	    // NOTE(review): assumes labels in the range are sorted and
	    // unique (one slot per distinct branch label) — confirm.
	    if (m < pos + search_len - 1) {
		pos = m + 1;
		return true;
	    }
	    return false;
	} else {
	    l = m + 1;
	}
    }

    // l is the first index whose label exceeds target (if in range).
    if (l < pos + search_len) {
	pos = l;
	return true;
    }
    return false;
}

bool LabelVector::linearSearchGreaterThan(const label_t target, position_t& pos, const position_t search_len) const {
    for (position_t i = 0; i < search_len; i++) {
	if (labels_[pos + i] > target) {
	    pos += i;
	    return true;
	}
    }
    return false;
}

} // namespace surf

#endif // LABELVECTOR_H_


================================================
FILE: include/louds_dense.hpp
================================================
#ifndef LOUDSDENSE_H_
#define LOUDSDENSE_H_

#include <string>

#include "config.hpp"
#include "rank.hpp"
#include "suffix.hpp"
#include "surf_builder.hpp"

namespace surf {

// LOUDS-DENSE: bitmap encoding of the upper (dense) trie levels. Each
// node occupies kNodeFanout (256) bits in label_bitmaps_ (one bit per
// possible byte label) with matching child-indicator bits; one bit per
// node marks prefixes that are themselves keys. Searches that reach
// height_ continue in LoudsSparse via an out node number.
class LoudsDense {
public:
    // Cursor over the dense levels. May be "incomplete": the flags tell
    // the caller which follow-up operation must be finished in
    // LoudsSparse (search / moveToLeftMostKey / moveToRightMostKey).
    class Iter {
    public:
	Iter() : is_valid_(false) {};
	Iter(LoudsDense* trie) : is_valid_(false), is_search_complete_(false),
				 is_move_left_complete_(false),
				 is_move_right_complete_(false),
				 trie_(trie),
				 send_out_node_num_(0), key_len_(0),
				 is_at_prefix_key_(false) {
	    // Pre-size the per-level key/position stacks to the trie height.
	    for (level_t level = 0; level < trie_->getHeight(); level++) {
		key_.push_back(0);
		pos_in_trie_.push_back(0);
	    }
	}

	void clear();
	bool isValid() const { return is_valid_; };
	bool isSearchComplete() const { return is_search_complete_; };
	bool isMoveLeftComplete() const { return is_move_left_complete_; };
	bool isMoveRightComplete() const { return is_move_right_complete_; };
	// True when no follow-up work remains in LoudsSparse.
	bool isComplete() const {
	    return (is_search_complete_ &&
		    (is_move_left_complete_ && is_move_right_complete_));
	}

	int compare(const std::string& key) const;
	std::string getKey() const;
	int getSuffix(word_t* suffix) const;
	std::string getKeyWithSuffix(unsigned* bitlen) const;
	position_t getSendOutNodeNum() const { return send_out_node_num_; };

	void setToFirstLabelInRoot();
	void setToLastLabelInRoot();
	void moveToLeftMostKey();
	void moveToRightMostKey();
	void operator ++(int);
	void operator --(int);

    private:
	inline void append(position_t pos);
	inline void set(level_t level, position_t pos);
	inline void setSendOutNodeNum(position_t node_num) { send_out_node_num_ = node_num; };
	inline void setFlags(const bool is_valid, const bool is_search_complete, 
			     const bool is_move_left_complete,
			     const bool is_move_right_complete);

    private:
	// True means the iter either points to a valid key 
	// or to a prefix with length trie_->getHeight()
	bool is_valid_;
	// If false, call moveToKeyGreaterThan in LoudsSparse to complete
	bool is_search_complete_; 
	// If false, call moveToLeftMostKey in LoudsSparse to complete
	bool is_move_left_complete_;
	// If false, call moveToRightMostKey in LoudsSparse to complete
	bool is_move_right_complete_; 
	LoudsDense* trie_;
	position_t send_out_node_num_;
	level_t key_len_; // Does NOT include suffix

	std::vector<label_t> key_;
	std::vector<position_t> pos_in_trie_;
	bool is_at_prefix_key_;

	friend class LoudsDense;
    };

public:
    LoudsDense() {};
    LoudsDense(const SuRFBuilder* builder);

    // NOTE(review): heap members (level_cuts_, bitmaps, suffixes) are not
    // freed here; callers release them via destroy() — confirm ownership.
    ~LoudsDense() {}

    // Returns whether key exists in the trie so far
    // out_node_num == 0 means search terminates in louds-dense.
    bool lookupKey(const std::string& key, position_t& out_node_num) const;
    // return value indicates potential false positive
    bool moveToKeyGreaterThan(const std::string& key, 
			      const bool inclusive, LoudsDense::Iter& iter) const;
    uint64_t approxCount(const LoudsDense::Iter* iter_left,
			 const LoudsDense::Iter* iter_right,
			 position_t& out_node_num_left,
			 position_t& out_node_num_right) const;

    uint64_t getHeight() const { return height_; };
    uint64_t serializedSize() const;
    uint64_t getMemoryUsage() const;

    // Write height_, level_cuts_, then the four bit structures; dst
    // advances and is 8-byte aligned between sections.
    void serialize(char*& dst) const {
	memcpy(dst, &height_, sizeof(height_));
	dst += sizeof(height_);
	memcpy(dst, level_cuts_, sizeof(position_t) * height_);
	dst += (sizeof(position_t) * height_);
	align(dst);
	label_bitmaps_->serialize(dst);
	child_indicator_bitmaps_->serialize(dst);
	prefixkey_indicator_bits_->serialize(dst);
	suffixes_->serialize(dst);
	align(dst);
    }

    // Mirror of serialize(); reads the sections in the same order.
    static LoudsDense* deSerialize(char*& src) {
	LoudsDense* louds_dense = new LoudsDense();
	memcpy(&(louds_dense->height_), src, sizeof(louds_dense->height_));
	src += sizeof(louds_dense->height_);
	louds_dense->level_cuts_ = new position_t[louds_dense->height_];
	memcpy(louds_dense->level_cuts_, src,
	       sizeof(position_t) * (louds_dense->height_));
	src += (sizeof(position_t) * (louds_dense->height_));
	align(src);
	louds_dense->label_bitmaps_ = BitvectorRank::deSerialize(src);
	louds_dense->child_indicator_bitmaps_ = BitvectorRank::deSerialize(src);
	louds_dense->prefixkey_indicator_bits_ = BitvectorRank::deSerialize(src);
	louds_dense->suffixes_ = BitvectorSuffix::deSerialize(src);
	align(src);
	return louds_dense;
    }

    // Release the owned bit structures (explicit, not in the destructor).
    void destroy() {
	label_bitmaps_->destroy();
	child_indicator_bitmaps_->destroy();
	prefixkey_indicator_bits_->destroy();
	suffixes_->destroy();
    }

private:
    position_t getChildNodeNum(const position_t pos) const;
    position_t getSuffixPos(const position_t pos, const bool is_prefix_key) const;
    position_t getNextPos(const position_t pos) const;
    position_t getPrevPos(const position_t pos, bool* is_out_of_bound) const;

    bool compareSuffixGreaterThan(const position_t pos, const std::string& key, 
				  const level_t level, const bool inclusive, 
				  LoudsDense::Iter& iter) const;
    void extendPosList(std::vector<position_t>& pos_list,
		       position_t& out_node_num) const;

private:
    static const position_t kNodeFanout = 256;
    static const position_t kRankBasicBlockSize  = 512;

    level_t height_;
    position_t* level_cuts_; // position of the last bit at each level

    BitvectorRank* label_bitmaps_;
    BitvectorRank* child_indicator_bitmaps_;
    BitvectorRank* prefixkey_indicator_bits_; //1 bit per internal node
    BitvectorSuffix* suffixes_;
};


// Build the dense levels from the builder's per-level bitmaps.
// height_ = number of levels encoded densely; deeper levels are handled
// by LoudsSparse.
LoudsDense::LoudsDense(const SuRFBuilder* builder) {
    height_ = builder->getSparseStartLevel();
    std::vector<position_t> num_bits_per_level;
    for (level_t level = 0; level < height_; level++)
	num_bits_per_level.push_back(builder->getBitmapLabels()[level].size() * kWordSize);

    // level_cuts_[level] = index of the last bit belonging to `level`
    // in the concatenated bitmap.
    level_cuts_ = new position_t[height_];
    position_t bit_count = 0;
    for (level_t level = 0; level < height_; level++) {
	bit_count += num_bits_per_level[level];
	level_cuts_[level] = bit_count - 1;
    }

    label_bitmaps_ = new BitvectorRank(kRankBasicBlockSize, builder->getBitmapLabels(),
				       num_bits_per_level, 0, height_);
    child_indicator_bitmaps_ = new BitvectorRank(kRankBasicBlockSize,
						 builder->getBitmapChildIndicatorBits(),
						 num_bits_per_level, 0, height_);
    // Prefix-key bits are per node (not per label slot), hence the
    // node counts rather than the label bit counts.
    prefixkey_indicator_bits_ = new BitvectorRank(kRankBasicBlockSize,
						  builder->getPrefixkeyIndicatorBits(),
						  builder->getNodeCounts(), 0, height_);

    if (builder->getSuffixType() == kNone) {
	suffixes_ = new BitvectorSuffix();
    } else {
	// Each stored suffix occupies hash_suffix_len + real_suffix_len bits.
	level_t hash_suffix_len = builder->getHashSuffixLen();
        level_t real_suffix_len = builder->getRealSuffixLen();
        level_t suffix_len = hash_suffix_len + real_suffix_len;
	std::vector<position_t> num_suffix_bits_per_level;
	for (level_t level = 0; level < height_; level++)
	    num_suffix_bits_per_level.push_back(builder->getSuffixCounts()[level] * suffix_len);
	suffixes_ = new BitvectorSuffix(builder->getSuffixType(), 
					hash_suffix_len, real_suffix_len,
                                        builder->getSuffixes(),
					num_suffix_bits_per_level, 0, height_);
    }
}

bool LoudsDense::lookupKey(const std::string& key, position_t& out_node_num) const {
    position_t node_num = 0;
    position_t pos = 0;
    for (level_t level = 0; level < height_; level++) {
	pos = (node_num * kNodeFanout);
	if (level >= key.length()) { //if run out of searchKey bytes
	    if (prefixkey_indicator_bits_->readBit(node_num)) //if the prefix is also a key
		return suffixes_->checkEquality(getSuffixPos(pos, true), key, level + 1);
	    else
		return false;
	}
	pos += (label_t)key[level];

	//child_indicator_bitmaps_->prefetch(pos);

	if (!label_bitmaps_->readBit(pos)) //if key byte does not exist
	    return false;

	if (!child_indicator_bitmaps_->readBit(pos)) //if trie branch terminates
	    return suffixes_->checkEquality(getSuffixPos(pos, false), key, level + 1);

	node_num = getChildNodeNum(pos);
    }
    //search will continue in LoudsSparse
    out_node_num = node_num;
    return true;
}

// Move `iter` to the smallest key >= `key` (or > `key` when `inclusive`
// is false) within the dense levels.  The iterator flags record whether
// the search and boundary moves completed here or must continue in
// LoudsSparse (see setFlags call sites).  The return value indicates a
// potential false positive, mirroring LoudsSparse::moveToKeyGreaterThan.
bool LoudsDense::moveToKeyGreaterThan(const std::string& key, 
				      const bool inclusive, LoudsDense::Iter& iter) const {
    position_t node_num = 0;
    position_t pos = 0;
    for (level_t level = 0; level < height_; level++) {
	// if is_at_prefix_key_, pos is at the next valid position in the child node
	pos = node_num * kNodeFanout;
	if (level >= key.length()) { // if run out of searchKey bytes
	    // `key` is a proper prefix of everything below this node, so the
	    // answer is the prefix key itself (if present) or the leftmost key.
	    iter.append(getNextPos(pos - 1));
	    if (prefixkey_indicator_bits_->readBit(node_num)) //if the prefix is also a key
		iter.is_at_prefix_key_ = true;
	    else
		iter.moveToLeftMostKey();
	    // valid, search complete, moveLeft complete, moveRight complete
	    iter.setFlags(true, true, true, true); 
	    return true;
	}

	pos += (label_t)key[level];
	iter.append(pos);

	// if no exact match
	if (!label_bitmaps_->readBit(pos)) {
	    // iter points at the missing label slot; ++ advances it to the
	    // next existing branch, i.e. the smallest key greater than `key`.
	    iter++;
	    return false;
	}
	//if trie branch terminates
	if (!child_indicator_bitmaps_->readBit(pos))
	    return compareSuffixGreaterThan(pos, key, level+1, inclusive, iter);
	node_num = getChildNodeNum(pos);
    }

    //search will continue in LoudsSparse
    iter.setSendOutNodeNum(node_num);
    // valid, search INCOMPLETE, moveLeft complete, moveRight complete
    iter.setFlags(true, false, true, true);
    return true;
}

// Extend a partial root-to-node position list down through all dense
// levels by repeatedly taking the first position of the next child node.
// A level whose position falls past the last valid bit of that level
// (level_cuts_) is marked kMaxPos, and all deeper levels are padded with
// kMaxPos so the list always ends with height_ entries.  out_node_num
// receives the node number where the path leaves the dense levels, or
// kMaxPos if the path ran off the end.
void LoudsDense::extendPosList(std::vector<position_t>& pos_list,
			       position_t& out_node_num) const {
    position_t node_num = 0;
    position_t pos = pos_list[pos_list.size() - 1];
    for (level_t i = pos_list.size(); i < height_; i++) {
	node_num = getChildNodeNum(pos);
	// A terminating branch still advances the node numbering by one.
	if (!child_indicator_bitmaps_->readBit(pos))
	    node_num++;
	pos = (node_num * kNodeFanout);
	if (pos > level_cuts_[i]) {
	    pos = kMaxPos;
	    pos_list.push_back(pos);
	    break;
	}
	pos_list.push_back(pos);
    }
    if (pos == kMaxPos) {
	// Pad the remaining levels; callers index the list by level.
	for (level_t i = pos_list.size(); i < height_; i++)
	    pos_list.push_back(pos);
	out_node_num = pos;
    } else {
	out_node_num = getChildNodeNum(pos);
	if (!child_indicator_bitmaps_->readBit(pos))
	    out_node_num++;
    }
}

// Approximate the number of leaves stored between the two iterator
// positions, counted level by level over the dense encoding.  Per level,
// leaves in [left_pos, right_pos) are counted via rank queries as
// (set labels) - (labels with children) + (prefix keys), with boundary
// corrections applied below.  out_node_num_left/right receive the node
// numbers where the two (extended) paths leave the dense levels, so
// counting can continue in LoudsSparse.
uint64_t LoudsDense::approxCount(const LoudsDense::Iter* iter_left,
				 const LoudsDense::Iter* iter_right,
				 position_t& out_node_num_left,
				 position_t& out_node_num_right) const {
    // Materialize both iterator paths and extend them to full depth.
    std::vector<position_t> left_pos_list, right_pos_list;
    for (level_t i = 0; i < iter_left->key_len_; i++)
	left_pos_list.push_back(iter_left->pos_in_trie_[i]);
    level_t ori_left_len = left_pos_list.size();
    extendPosList(left_pos_list, out_node_num_left);
    
    for (level_t i = 0; i < iter_right->key_len_; i++)
	right_pos_list.push_back(iter_right->pos_in_trie_[i]);
    level_t ori_right_len = right_pos_list.size();
    extendPosList(right_pos_list, out_node_num_right);

    uint64_t count = 0;
    for (level_t i = 0; i < height_; i++) {
	position_t left_pos = left_pos_list[i];
	if (left_pos == kMaxPos) break;
	// A prefix-key iterator stores a pseudo label; snap to the node start.
	if (i == (ori_left_len - 1) && iter_left->is_at_prefix_key_)
	    left_pos = (left_pos / kNodeFanout) * kNodeFanout;
	position_t right_pos = right_pos_list[i];
	// Right path exhausted at this level: count to the level's last bit.
	if (right_pos == kMaxPos)
	    right_pos = level_cuts_[i];
	if (i == (ori_right_len - 1) && iter_right->is_at_prefix_key_)
	    right_pos = (right_pos / kNodeFanout) * kNodeFanout;
	//assert(left_pos <= right_pos);
	if (left_pos < right_pos) {
	    // Extended (non-original) path positions point at node starts;
	    // move them to the first existing label.
	    if (i >= ori_left_len)
		left_pos = getNextPos(left_pos);
	    if (i >= ori_right_len && right_pos != level_cuts_[height_ - 1])
		right_pos = getNextPos(right_pos);
	    bool has_prefix_key_left
		= prefixkey_indicator_bits_->readBit(left_pos / kNodeFanout);
	    bool has_prefix_key_right
		= prefixkey_indicator_bits_->readBit(right_pos / kNodeFanout);
	    position_t rank_left_label = label_bitmaps_->rank(left_pos);
	    position_t rank_right_label = label_bitmaps_->rank(right_pos);
	    // The very last level bit is inclusive in the range.
	    if (right_pos == level_cuts_[height_ - 1])
		rank_right_label++;
	    position_t rank_left_ind = child_indicator_bitmaps_->rank(left_pos);
	    position_t rank_right_ind = child_indicator_bitmaps_->rank(right_pos);
	    position_t rank_left_prefix
		= prefixkey_indicator_bits_->rank(left_pos / kNodeFanout);
	    position_t rank_right_prefix
		= prefixkey_indicator_bits_->rank(right_pos / kNodeFanout);
	    position_t num_leafs = (rank_right_label - rank_left_label)
		- (rank_right_ind - rank_left_ind)
		+ (rank_right_prefix - rank_left_prefix);
	    // offcount in child_indicators
	    if (child_indicator_bitmaps_->readBit(right_pos))
		num_leafs++;
	    if (child_indicator_bitmaps_->readBit(left_pos))
		num_leafs--;
	    // offcount in prefix keys
	    if (i >= ori_right_len && has_prefix_key_right)
		num_leafs--;
	    if (i >= ori_left_len && has_prefix_key_left)
		num_leafs++;
	    // Exclude the leaf where the left iterator itself terminates.
	    if (iter_left->is_search_complete_ && (i == ori_left_len - 1))
		num_leafs--;
	    count += num_leafs;
	}
    }
    return count;
}

// Number of bytes serialize() will write, including the alignment
// padding inserted by sizeAlign().
uint64_t LoudsDense::serializedSize() const {
    uint64_t total = sizeof(height_) + sizeof(position_t) * height_;
    sizeAlign(total);
    total += label_bitmaps_->serializedSize();
    total += child_indicator_bitmaps_->serializedSize();
    total += prefixkey_indicator_bits_->serializedSize();
    total += suffixes_->serializedSize();
    sizeAlign(total);
    return total;
}

// In-memory footprint: the object header plus all owned bit structures.
uint64_t LoudsDense::getMemoryUsage() const {
    uint64_t bytes = sizeof(LoudsDense);
    bytes += label_bitmaps_->size();
    bytes += child_indicator_bitmaps_->size();
    bytes += prefixkey_indicator_bits_->size();
    bytes += suffixes_->size();
    return bytes;
}

// The child of the branch at `pos` is numbered by how many set
// child-indicator bits precede it (inclusive rank).
position_t LoudsDense::getChildNodeNum(const position_t pos) const {
    const position_t child_rank = child_indicator_bitmaps_->rank(pos);
    return child_rank;
}

// Map a bitmap position to its slot in the suffix store.  A suffix slot
// exists for every set label bit without a child, plus one per prefix
// key; rank queries count how many such slots precede `pos`.
position_t LoudsDense::getSuffixPos(const position_t pos, const bool is_prefix_key) const {
    const position_t node_num = pos / kNodeFanout;
    const position_t num_labels = label_bitmaps_->rank(pos);
    const position_t num_children = child_indicator_bitmaps_->rank(pos);
    const position_t num_prefix_keys = prefixkey_indicator_bits_->rank(node_num);
    position_t suffix_pos = num_labels - num_children + num_prefix_keys - 1;
    // A prefix key's suffix precedes the suffix of a leaf label at `pos`.
    if (is_prefix_key
        && label_bitmaps_->readBit(pos)
        && !child_indicator_bitmaps_->readBit(pos))
        suffix_pos--;
    return suffix_pos;
}

// Position of the next set label bit strictly after `pos`.
position_t LoudsDense::getNextPos(const position_t pos) const {
    const position_t distance = label_bitmaps_->distanceToNextSetBit(pos);
    return pos + distance;
}

// Position of the closest set label bit strictly before `pos`.
// *is_out_of_bound is set when no such bit exists (returns 0 then).
position_t LoudsDense::getPrevPos(const position_t pos, bool* is_out_of_bound) const {
    const position_t distance = label_bitmaps_->distanceToPrevSetBit(pos);
    const bool out_of_bound = (pos <= distance);
    *is_out_of_bound = out_of_bound;
    return out_of_bound ? 0 : (pos - distance);
}

// After an exact label match terminated at `pos`, decide whether the
// stored suffix keeps the candidate >= the remainder of `key`.  When the
// suffix compares smaller, advance the iterator past this entry and
// report false; otherwise mark the iterator done.  A true return may be
// a false positive (hashed suffixes cannot prove equality).
bool LoudsDense::compareSuffixGreaterThan(const position_t pos, const std::string& key,
                                          const level_t level, const bool inclusive,
                                          LoudsDense::Iter& iter) const {
    const position_t suffix_pos = getSuffixPos(pos, false);
    const int cmp = suffixes_->compare(suffix_pos, key, level);
    const bool definitely_smaller = (cmp != kCouldBePositive) && (cmp < 0);
    if (definitely_smaller) {
        iter++;
        return false;
    }
    // valid, search complete, moveLeft complete, moveRight complete
    iter.setFlags(true, true, true, true);
    return true;
}

//============================================================================

// Reset to an empty, invalid iterator; the trie binding and the
// preallocated key/position buffers are kept.
void LoudsDense::Iter::clear() {
    is_at_prefix_key_ = false;
    key_len_ = 0;
    is_valid_ = false;
}

// Three-way compare of the iterator's current key against `key`,
// restricted to the dense levels.  Negative/zero/positive like
// std::string::compare; may yield kCouldBePositive when only a hashed
// suffix is available to break the tie (see BitvectorSuffix::compare).
int LoudsDense::Iter::compare(const std::string& key) const {
    // A prefix key is strictly smaller than any longer key it prefixes.
    if (is_at_prefix_key_ && (key_len_ - 1) < key.length())
	return -1;
    std::string iter_key = getKey();
    // Compare only the bytes the dense levels cover.
    std::string key_dense = key.substr(0, iter_key.length());
    int compare = iter_key.compare(key_dense);
    if (compare != 0) return compare;
    if (isComplete()) {
	// Equal so far and the key terminates here: the suffix decides.
	position_t suffix_pos = trie_->getSuffixPos(pos_in_trie_[key_len_ - 1], is_at_prefix_key_);
	return trie_->suffixes_->compare(suffix_pos, key, key_len_);
    }
    return compare;
}

std::string LoudsDense::Iter::getKey() const {
    if (!is_valid_)
	return std::string();
    level_t len = key_len_;
    if (is_at_prefix_key_)
	len--;
    return std::string((const char*)key_.data(), (size_t)len);
}

// Copy the stored real-suffix bits of the current key into *suffix and
// return the suffix length in bits; 0 when the trie stores no real
// suffix or the iterator's search is incomplete.
int LoudsDense::Iter::getSuffix(word_t* suffix) const {
    const bool has_real_suffix =
        (trie_->suffixes_->getType() == kReal) || (trie_->suffixes_->getType() == kMixed);
    if (isComplete() && has_real_suffix) {
        const position_t suffix_pos =
            trie_->getSuffixPos(pos_in_trie_[key_len_ - 1], is_at_prefix_key_);
        *suffix = trie_->suffixes_->readReal(suffix_pos);
        return trie_->suffixes_->getRealSuffixLen();
    }
    *suffix = 0;
    return 0;
}

// Return the iterator key with its stored real suffix appended as raw
// bytes.  *bitlen receives the number of valid bits in the final byte
// (0 means the suffix is byte-aligned).
std::string LoudsDense::Iter::getKeyWithSuffix(unsigned* bitlen) const {
    std::string iter_key = getKey();
    if (isComplete()
        && ((trie_->suffixes_->getType() == kReal) || (trie_->suffixes_->getType() == kMixed))) {
	position_t suffix_pos = trie_->getSuffixPos(pos_in_trie_[key_len_ - 1], is_at_prefix_key_);
	word_t suffix = trie_->suffixes_->readReal(suffix_pos);
	if (suffix > 0) {
	    level_t suffix_len = trie_->suffixes_->getRealSuffixLen();
	    *bitlen = suffix_len % 8;
	    // Left-align the suffix bits in the 64-bit word, then emit it
	    // most-significant byte first.
	    suffix <<= (64 - suffix_len);
	    char* suffix_str = reinterpret_cast<char*>(&suffix);
	    // NOTE(review): walking from byte 7 downward assumes a
	    // little-endian host -- confirm portability requirements.
	    suffix_str += 7;
	    unsigned pos = 0;
	    while (pos < suffix_len) {
		iter_key.append(suffix_str, 1);
		suffix_str--;
		pos += 8;
	    }
	}
    }
    return iter_key;
}

// Push the label encoded in `pos` onto the key and record the position.
void LoudsDense::Iter::append(position_t pos) {
    assert(key_len_ < key_.size());
    const label_t label = (label_t)(pos % kNodeFanout);
    key_[key_len_] = label;
    pos_in_trie_[key_len_] = pos;
    ++key_len_;
}

// Overwrite the label/position recorded at `level` (key length unchanged).
void LoudsDense::Iter::set(level_t level, position_t pos) {
    assert(level < key_.size());
    pos_in_trie_[level] = pos;
    key_[level] = (label_t)(pos % kNodeFanout);
}

// Record the iterator status in one call: whether it points at a valid
// key, whether the search finished inside LoudsDense, and whether the
// left/right boundary moves completed without needing LoudsSparse.
void LoudsDense::Iter::setFlags(const bool is_valid,
                                const bool is_search_complete,
                                const bool is_move_left_complete,
                                const bool is_move_right_complete) {
    is_move_right_complete_ = is_move_right_complete;
    is_move_left_complete_ = is_move_left_complete;
    is_search_complete_ = is_search_complete;
    is_valid_ = is_valid;
}

void LoudsDense::Iter::setToFirstLabelInRoot() {
    if (trie_->label_bitmaps_->readBit(0)) {
	pos_in_trie_[0] = 0;
	key_[0] = (label_t)0;
    } else {
	pos_in_trie_[0] = trie_->getNextPos(0);
	key_[0] = (label_t)pos_in_trie_[0];
    }
    key_len_++;
}

void LoudsDense::Iter::setToLastLabelInRoot() {
    bool is_out_of_bound;
    pos_in_trie_[0] = trie_->getPrevPos(kNodeFanout, &is_out_of_bound);
    key_[0] = (label_t)pos_in_trie_[0];
    key_len_++;
}

// Descend from the current position to the smallest key underneath it.
// Stops immediately when the current branch already terminates, or on a
// prefix key found on the way down; otherwise descends until the dense
// levels run out, leaving send_out_node_num_ for LoudsSparse to resume.
void LoudsDense::Iter::moveToLeftMostKey() {
    assert(key_len_ > 0);
    level_t level = key_len_ - 1;
    position_t pos = pos_in_trie_[level];
    if (!trie_->child_indicator_bitmaps_->readBit(pos))
	// valid, search complete, moveLeft complete, moveRight complete
	return setFlags(true, true, true, true);

    while (level < trie_->getHeight() - 1) {
	position_t node_num = trie_->getChildNodeNum(pos);
	//if the current prefix is also a key
	if (trie_->prefixkey_indicator_bits_->readBit(node_num)) {
	    // Record the child node's first existing label so the iterator
	    // can later step past the prefix key (see operator++).
	    append(trie_->getNextPos(node_num * kNodeFanout - 1));
	    is_at_prefix_key_ = true;
	    // valid, search complete, moveLeft complete, moveRight complete
	    return setFlags(true, true, true, true);
	}

	// First existing label in the child node.
	pos = trie_->getNextPos(node_num * kNodeFanout - 1);
	append(pos);

	// if trie branch terminates
	if (!trie_->child_indicator_bitmaps_->readBit(pos))
	    // valid, search complete, moveLeft complete, moveRight complete
	    return setFlags(true, true, true, true);

	level++;
    }
    send_out_node_num_ = trie_->getChildNodeNum(pos);
    // valid, search complete, moveLeft INCOMPLETE, moveRight complete
    setFlags(true, true, false, true);
}

// Descend from the current position to the largest key underneath it,
// mirroring moveToLeftMostKey but following the last label of each
// child node.  Invalidates the iterator if a node unexpectedly has no
// label before its boundary.
void LoudsDense::Iter::moveToRightMostKey() {
    assert(key_len_ > 0);
    level_t level = key_len_ - 1;
    position_t pos = pos_in_trie_[level];
    if (!trie_->child_indicator_bitmaps_->readBit(pos))
	// valid, search complete, moveLeft complete, moveRight complete
	return setFlags(true, true, true, true);

    while (level < trie_->getHeight() - 1) {
	position_t node_num = trie_->getChildNodeNum(pos);
	bool is_out_of_bound;
	// Last existing label before the child node's upper boundary.
	pos = trie_->getPrevPos((node_num + 1) * kNodeFanout, &is_out_of_bound);
	if (is_out_of_bound) {
	    is_valid_ = false;
	    return;
	}
	append(pos);

	// if trie branch terminates
	if (!trie_->child_indicator_bitmaps_->readBit(pos))
	    // valid, search complete, moveLeft complete, moveRight complete
	    return setFlags(true, true, true, true);

	level++;
    }
    send_out_node_num_ = trie_->getChildNodeNum(pos);
    // valid, search complete, moveleft complete, moveRight INCOMPLETE
    setFlags(true, true, true, false);
}

// Advance to the next key in sorted order.  When sitting on a prefix
// key, the next key is the leftmost key under the already-recorded
// child position.  Otherwise step to the next set label bit, popping
// levels whenever that bit crosses into a different node; the iterator
// becomes invalid once the root level is exhausted.
void LoudsDense::Iter::operator ++(int) {
    assert(key_len_ > 0);
    if (is_at_prefix_key_) {
	is_at_prefix_key_ = false;
	return moveToLeftMostKey();
    }
    position_t pos = pos_in_trie_[key_len_ - 1];
    position_t next_pos = trie_->getNextPos(pos);
    // if crossing node boundary
    while ((next_pos / kNodeFanout) > (pos / kNodeFanout)) {
	// No further label in this node: pop a level and retry there.
	key_len_--;
	if (key_len_ == 0) {
	    is_valid_ = false;
	    return;
	}
	pos = pos_in_trie_[key_len_ - 1];
	next_pos = trie_->getNextPos(pos);
    }
    set(key_len_ - 1, next_pos);
    return moveToLeftMostKey();
}

// Step back to the previous key in sorted order.  Handles prefix keys
// in both directions: when leaving one, drop its pseudo label first;
// when stepping before a node's first label, stop on the node itself if
// it is a prefix key.  The iterator becomes invalid when stepping
// before the first key of the trie.
void LoudsDense::Iter::operator --(int) {
    assert(key_len_ > 0);
    if (is_at_prefix_key_) {
	// Drop the pseudo label recorded for the prefix key.
	is_at_prefix_key_ = false;
	key_len_--;
    }
    position_t pos = pos_in_trie_[key_len_ - 1];
    bool is_out_of_bound;
    position_t prev_pos = trie_->getPrevPos(pos, &is_out_of_bound);
    if (is_out_of_bound) {
	is_valid_ = false;
	return;
    }
    
    // if crossing node boundary
    while ((prev_pos / kNodeFanout) < (pos / kNodeFanout)) {
	//if the current prefix is also a key
	position_t node_num = pos / kNodeFanout;
	if (trie_->prefixkey_indicator_bits_->readBit(node_num)) {
	    is_at_prefix_key_ = true;
	    // valid, search complete, moveLeft complete, moveRight complete
	    return setFlags(true, true, true, true);
	}
	
	// No earlier label in this node: pop a level and retry there.
	key_len_--;
	if (key_len_ == 0) {
	    is_valid_ = false;
	    return;
	}
	pos = pos_in_trie_[key_len_ - 1];
	prev_pos = trie_->getPrevPos(pos, &is_out_of_bound);
	if (is_out_of_bound) {
	    is_valid_ = false;
	    return;
	}
    }
    set(key_len_ - 1, prev_pos);
    return moveToRightMostKey();
}

} //namespace surf

#endif // LOUDSDENSE_H_


================================================
FILE: include/louds_sparse.hpp
================================================
#ifndef LOUDSSPARSE_H_
#define LOUDSSPARSE_H_

#include <string>

#include "config.hpp"
#include "label_vector.hpp"
#include "rank.hpp"
#include "select.hpp"
#include "suffix.hpp"
#include "surf_builder.hpp"

namespace surf {

// LOUDS-Sparse half of SuRF: encodes the lower trie levels (from
// start_level_ down) as parallel arrays -- one byte per label
// (labels_), one child-indicator bit per label, and one LOUDS bit per
// label marking node starts -- plus an optional per-leaf suffix store.
class LoudsSparse {
public:
    // Ordered iterator over the sparse levels; cooperates with the
    // dense iterator via the hand-over node number (start_node_num_).
    class Iter {
    public:
	Iter() : is_valid_(false) {};
	Iter(LoudsSparse* trie) : is_valid_(false), trie_(trie), start_node_num_(0), 
				  key_len_(0), is_at_terminator_(false) {
	    start_level_ = trie_->getStartLevel();
	    for (level_t level = start_level_; level < trie_->getHeight(); level++) {
		key_.push_back(0);
		pos_in_trie_.push_back(0);
	    }
	}

	void clear();
	bool isValid() const { return is_valid_; };
	int compare(const std::string& key) const;
	std::string getKey() const;
        int getSuffix(word_t* suffix) const;
	std::string getKeyWithSuffix(unsigned* bitlen) const;

	position_t getStartNodeNum() const { return start_node_num_; };
	void setStartNodeNum(position_t node_num) { start_node_num_ = node_num; };

	void setToFirstLabelInRoot();
	void setToLastLabelInRoot();
	void moveToLeftMostKey();
	void moveToRightMostKey();
	void operator ++(int);
	void operator --(int);

    private:
	void append(const position_t pos);
	void append(const label_t label, const position_t pos);
	void set(const level_t level, const position_t pos);

    private:
	bool is_valid_; // True means the iter currently points to a valid key
	LoudsSparse* trie_;
	level_t start_level_;
	position_t start_node_num_; // Passed in by the dense iterator; default = 0
	level_t key_len_; // Start counting from start_level_; does NOT include suffix

	std::vector<label_t> key_;
	std::vector<position_t> pos_in_trie_;
	bool is_at_terminator_; // current position holds a kTerminator label

	friend class LoudsSparse;
    };

public:
    LoudsSparse() {};
    LoudsSparse(const SuRFBuilder* builder);

    ~LoudsSparse() {}

    // point query: trie walk starts at node "in_node_num" instead of root
    // in_node_num is provided by louds-dense's lookupKey function
    bool lookupKey(const std::string& key, const position_t in_node_num) const;
    // return value indicates potential false positive
    bool moveToKeyGreaterThan(const std::string& key, 
			      const bool inclusive, LoudsSparse::Iter& iter) const;
    // Approximate leaf count between the two iterators; the in_node_num
    // arguments are the dense-to-sparse hand-over nodes.
    uint64_t approxCount(const LoudsSparse::Iter* iter_left,
			 const LoudsSparse::Iter* iter_right,
			 const position_t in_node_num_left,
			 const position_t in_node_num_right) const;

    level_t getHeight() const { return height_; };
    level_t getStartLevel() const { return start_level_; };
    uint64_t serializedSize() const;
    uint64_t getMemoryUsage() const;

    // Serialization layout: scalar header fields, level_cuts_, alignment
    // padding, then the four sub-structures.  deSerialize() must consume
    // fields in exactly this order.
    void serialize(char*& dst) const {
	memcpy(dst, &height_, sizeof(height_));
	dst += sizeof(height_);
	memcpy(dst, &start_level_, sizeof(start_level_));
	dst += sizeof(start_level_);
	memcpy(dst, &node_count_dense_, sizeof(node_count_dense_));
	dst += sizeof(node_count_dense_);
	memcpy(dst, &child_count_dense_, sizeof(child_count_dense_));
	dst += sizeof(child_count_dense_);
	memcpy(dst, level_cuts_, sizeof(position_t) * height_);
	dst += (sizeof(position_t) * height_);
	align(dst);
	labels_->serialize(dst);
	child_indicator_bits_->serialize(dst);
	louds_bits_->serialize(dst);
	suffixes_->serialize(dst);
	align(dst);
    }

    // Reconstruct a LoudsSparse from a buffer written by serialize();
    // advances src past the consumed bytes.  Caller owns the result.
    static LoudsSparse* deSerialize(char*& src) {
	LoudsSparse* louds_sparse = new LoudsSparse();
	memcpy(&(louds_sparse->height_), src, sizeof(louds_sparse->height_));
	src += sizeof(louds_sparse->height_);
	memcpy(&(louds_sparse->start_level_), src, sizeof(louds_sparse->start_level_));
	src += sizeof(louds_sparse->start_level_);
	memcpy(&(louds_sparse->node_count_dense_), src, sizeof(louds_sparse->node_count_dense_));
	src += sizeof(louds_sparse->node_count_dense_);
	memcpy(&(louds_sparse->child_count_dense_), src, sizeof(louds_sparse->child_count_dense_));
	src += sizeof(louds_sparse->child_count_dense_);
	louds_sparse->level_cuts_ = new position_t[louds_sparse->height_];
	memcpy(louds_sparse->level_cuts_, src,
	       sizeof(position_t) * (louds_sparse->height_));
	src += (sizeof(position_t) * (louds_sparse->height_));
	align(src);
	louds_sparse->labels_ = LabelVector::deSerialize(src);
	louds_sparse->child_indicator_bits_ = BitvectorRank::deSerialize(src);
	louds_sparse->louds_bits_ = BitvectorSelect::deSerialize(src);
	louds_sparse->suffixes_ = BitvectorSuffix::deSerialize(src);
	align(src);
	return louds_sparse;
    }

    // Release owned memory (the destructor does not free anything).
    void destroy() {
	delete[] level_cuts_;
	labels_->destroy();
	child_indicator_bits_->destroy();
	louds_bits_->destroy();
	suffixes_->destroy();
    }

private:
    position_t getChildNodeNum(const position_t pos) const;
    position_t getFirstLabelPos(const position_t node_num) const;
    position_t getLastLabelPos(const position_t node_num) const;
    position_t getSuffixPos(const position_t pos) const;
    position_t nodeSize(const position_t pos) const;
    bool isEndofNode(const position_t pos) const;

    void moveToLeftInNextSubtrie(position_t pos, const position_t node_size, 
				 const label_t label, LoudsSparse::Iter& iter) const;
    // return value indicates potential false positive
    bool compareSuffixGreaterThan(const position_t pos, const std::string& key, 
				  const level_t level, const bool inclusive, 
				  LoudsSparse::Iter& iter) const;

    // Helpers for approxCount: build full root-to-bottom position lists.
    position_t appendToPosList(std::vector<position_t>& pos_list,
			       const position_t node_num, const level_t level,
			       const bool isLeft, bool& done) const;
    void extendPosList(std::vector<position_t>& left_pos_list,
		       std::vector<position_t>& right_pos_list,
		       const position_t left_in_node_num,
		       const position_t right_in_node_num) const;

private:
    static const position_t kRankBasicBlockSize = 512;
    static const position_t kSelectSampleInterval = 64;

    level_t height_; // trie height
    level_t start_level_; // louds-sparse encoding starts at this level
    // number of nodes in louds-dense encoding
    position_t node_count_dense_;
    // number of children(1's in child indicator bitmap) in louds-dense encoding
    position_t child_count_dense_;
    position_t* level_cuts_; // position of the last bit at each level

    LabelVector* labels_;
    BitvectorRank* child_indicator_bits_;
    BitvectorSelect* louds_bits_;
    BitvectorSuffix* suffixes_;
};


// Build the sparse encoding from the per-level label/bit sequences
// accumulated by SuRFBuilder, covering levels [start_level_, height_).
LoudsSparse::LoudsSparse(const SuRFBuilder* builder) {
    height_ = builder->getLabels().size();
    start_level_ = builder->getSparseStartLevel();

    // Nodes encoded by LoudsDense (all levels above start_level_).
    node_count_dense_ = 0;
    for (level_t level = 0; level < start_level_; level++)
	node_count_dense_ += builder->getNodeCounts()[level];

    // Children consumed by the dense levels; offsets the sparse child
    // ranks in getChildNodeNum().
    if (start_level_ == 0)
	child_count_dense_ = 0;
    else
	child_count_dense_ = node_count_dense_ + builder->getNodeCounts()[start_level_] - 1;

    labels_ = new LabelVector(builder->getLabels(), start_level_, height_);

    std::vector<position_t> num_items_per_level;
    for (level_t level = 0; level < height_; level++)
	num_items_per_level.push_back(builder->getLabels()[level].size());

    // level_cuts_[l] = position of the last label belonging to level l
    // within the sparse arrays (0 for the dense-encoded levels).
    level_cuts_ = new position_t[height_];
    for (level_t level = 0; level < start_level_; level++) {
	level_cuts_[level] = 0;
    }
    position_t bit_count = 0;
    for (level_t level = start_level_; level < height_; level++) {
	bit_count += num_items_per_level[level];
	level_cuts_[level] = bit_count - 1;
    }

    child_indicator_bits_ = new BitvectorRank(kRankBasicBlockSize, builder->getChildIndicatorBits(), 
					      num_items_per_level, start_level_, height_);
    louds_bits_ = new BitvectorSelect(kSelectSampleInterval, builder->getLoudsBits(), 
				      num_items_per_level, start_level_, height_);

    // Suffix store: each leaf carries hash_suffix_len + real_suffix_len bits.
    if (builder->getSuffixType() == kNone) {
	suffixes_ = new BitvectorSuffix();
    } else {
	level_t hash_suffix_len = builder->getHashSuffixLen();
        level_t real_suffix_len = builder->getRealSuffixLen();
        level_t suffix_len = hash_suffix_len + real_suffix_len;
	std::vector<position_t> num_suffix_bits_per_level;
	for (level_t level = 0; level < height_; level++)
	    num_suffix_bits_per_level.push_back(builder->getSuffixCounts()[level] * suffix_len);

	suffixes_ = new BitvectorSuffix(builder->getSuffixType(), hash_suffix_len, real_suffix_len,
                                        builder->getSuffixes(),
					num_suffix_bits_per_level, start_level_, height_);
    }
}

// Continue a point query in the sparse levels, starting at node
// `in_node_num` (the hand-over node from LoudsDense::lookupKey).
bool LoudsSparse::lookupKey(const std::string& key, const position_t in_node_num) const {
    position_t node_num = in_node_num;
    position_t pos = getFirstLabelPos(node_num);
    level_t level = start_level_;
    for (; level < key.length(); level++) {
        // No label matching this key byte in the current node.
        if (!labels_->search((label_t)key[level], pos, nodeSize(pos)))
            return false;

        // Branch ends here: the stored suffix decides.
        if (!child_indicator_bits_->readBit(pos))
            return suffixes_->checkEquality(getSuffixPos(pos), key, level + 1);

        // Descend to the child node.
        node_num = getChildNodeNum(pos);
        pos = getFirstLabelPos(node_num);
    }
    // Key bytes exhausted: only an explicit terminator label matches.
    if ((labels_->read(pos) == kTerminator) && !child_indicator_bits_->readBit(pos))
        return suffixes_->checkEquality(getSuffixPos(pos), key, level + 1);
    return false;
}

// Move `iter` to the smallest key >= `key` (or > `key` when `inclusive`
// is false) within the sparse levels, starting at the iterator's
// start node.  The return value indicates a potential false positive
// (see the declaration); a false return can still leave a valid iter.
bool LoudsSparse::moveToKeyGreaterThan(const std::string& key, 
				       const bool inclusive, LoudsSparse::Iter& iter) const {
    position_t node_num = iter.getStartNodeNum();
    position_t pos = getFirstLabelPos(node_num);

    level_t level;
    for (level = start_level_; level < key.length(); level++) {
	position_t node_size = nodeSize(pos);
	// if no exact match
	if (!labels_->search((label_t)key[level], pos, node_size)) {
	    moveToLeftInNextSubtrie(pos, node_size, key[level], iter);
	    return false;
	}

	iter.append(key[level], pos);

	// if trie branch terminates
	if (!child_indicator_bits_->readBit(pos))
	    return compareSuffixGreaterThan(pos, key, level+1, inclusive, iter);

	// move to child
	node_num = getChildNodeNum(pos);
	pos = getFirstLabelPos(node_num);
    }

    // Key bytes exhausted: a terminator label here represents the key
    // itself; for a non-inclusive query step past it.
    // NOTE(review): the !isEndofNode guard excludes a terminator that is
    // the node's only/last label -- confirm intent against the builder.
    if ((labels_->read(pos) == kTerminator)
	&& (!child_indicator_bits_->readBit(pos))
	&& !isEndofNode(pos)) {
	iter.append(kTerminator, pos);
	iter.is_at_terminator_ = true;
	if (!inclusive)
	    iter++;
	iter.is_valid_ = true;
	return false;
    }

    // No terminator: every key below this node is strictly greater.
    if (key.length() <= level) {
	iter.moveToLeftMostKey();
	return false;
    }

    iter.is_valid_ = true;
    return true;
}

// Append the first-label position of `node_num` to `pos_list` for the
// given (sparse-relative) level.  When that position falls past the last
// label of the level (level_cuts_), the path has run off the encoding:
// push kMaxPos instead (the right bound pads all remaining levels) and
// set `done`.  Returns the position pushed last.
position_t LoudsSparse::appendToPosList(std::vector<position_t>& pos_list,
					const position_t node_num,
					const level_t level,
					const bool isLeft, bool& done) const {
    position_t pos = getFirstLabelPos(node_num);
    if (pos > level_cuts_[start_level_ + level]) {
	pos = kMaxPos;
	if (isLeft) {
	    pos_list.push_back(pos);
	} else {
	    // Right bound: pre-pad the deeper levels with kMaxPos.
	    for (level_t j = 0; j < (height_ - level) - 1; j++)
		pos_list.push_back(pos);
	}
	done = true;
    }
    pos_list.push_back(pos);
    return pos;
}

// Extend both boundary position lists down through the sparse levels by
// repeatedly following the first label of the next child node (via
// appendToPosList).  Lists that start empty are seeded from the
// dense-to-sparse hand-over nodes.  Extension stops early once the two
// paths converge on the same position.
void LoudsSparse::extendPosList(std::vector<position_t>& left_pos_list,
				std::vector<position_t>& right_pos_list,
				const position_t left_in_node_num,
				const position_t right_in_node_num) const {
    position_t left_node_num = 0, right_node_num = 0, left_pos = 0, right_pos = 0;
    bool left_done = false, right_done = false;
    level_t start_depth = left_pos_list.size();
    if (start_depth > right_pos_list.size())
	start_depth = right_pos_list.size();
    if (start_depth == 0) {
	// Seed whichever list is empty from its hand-over node.
	if (left_pos_list.size() == 0)
	    left_pos = appendToPosList(left_pos_list, left_in_node_num,
				       0, true, left_done);
	if (right_pos_list.size() == 0)
	    right_pos = appendToPosList(right_pos_list, right_in_node_num,
					0, false, right_done);
	start_depth++;
    }

    left_pos = left_pos_list[left_pos_list.size() - 1];
    right_pos = right_pos_list[right_pos_list.size() - 1];
    for (level_t i = start_depth; i < (height_ - start_level_); i++) {
	if (left_pos == right_pos) break;
	if (!left_done && left_pos_list.size() <= i) {
	    left_node_num = getChildNodeNum(left_pos);
	    // A terminating branch still advances the node numbering.
	    if (!child_indicator_bits_->readBit(left_pos))
		left_node_num++;
	    left_pos = appendToPosList(left_pos_list, left_node_num,
				       i, true, left_done);
	}
	if (!right_done && right_pos_list.size() <= i) {
	    right_node_num = getChildNodeNum(right_pos);
	    if (!child_indicator_bits_->readBit(right_pos))
		right_node_num++;
	    right_pos = appendToPosList(right_pos_list, right_node_num,
					i, false, right_done);
	}
    }
}

// Approximate the number of leaves between the two iterators within the
// sparse levels.  Builds full boundary position lists, then per level
// counts leaves in [left_pos, right_pos) as the number of positions
// minus the number of positions with children, with boundary
// corrections.  in_node_num_left == kMaxPos means the range ended
// entirely inside the dense levels; kMaxPos on the right means the
// right bound ran past the end of the encoding.
uint64_t LoudsSparse::approxCount(const LoudsSparse::Iter* iter_left,
				  const LoudsSparse::Iter* iter_right,
				  const position_t in_node_num_left,
				  const position_t in_node_num_right) const {
    if (in_node_num_left == kMaxPos) return 0;
    std::vector<position_t> left_pos_list, right_pos_list;
    for (level_t i = 0; i < iter_left->key_len_; i++)
	left_pos_list.push_back(iter_left->pos_in_trie_[i]);
    level_t ori_left_len = left_pos_list.size();
    if (in_node_num_right == kMaxPos) {
	// Right bound is beyond the trie: count to the end of each level.
	for (level_t i = 0; i < (height_ - start_level_); i++)
	    right_pos_list.push_back(kMaxPos);
    } else {
	for (level_t i = 0; i < iter_right->key_len_; i++)
	    right_pos_list.push_back(iter_right->pos_in_trie_[i]);
    }
    extendPosList(left_pos_list, right_pos_list, in_node_num_left, in_node_num_right);

    uint64_t count = 0;
    level_t search_depth = left_pos_list.size();
    if (search_depth > right_pos_list.size())
	search_depth = right_pos_list.size();
    for (level_t i = 0; i < search_depth; i++) {
	position_t left_pos = left_pos_list[i];
	if (left_pos == kMaxPos) break;
	position_t right_pos = right_pos_list[i];
	if (right_pos == kMaxPos)
	    right_pos = level_cuts_[start_level_ + i] + 1;
	//assert(left_pos <= right_pos);
	if (left_pos < right_pos) {
	    // Leaves = positions without a child-indicator bit set.
	    position_t rank_left = child_indicator_bits_->rank(left_pos);
	    position_t rank_right = child_indicator_bits_->rank(right_pos);
	    position_t num_leafs = (right_pos - left_pos) - (rank_right - rank_left);
	    // Boundary corrections: ranks are inclusive of the endpoints.
	    if (child_indicator_bits_->readBit(right_pos))
		num_leafs++;
	    if (child_indicator_bits_->readBit(left_pos))
		num_leafs--;
	    // Exclude the leaf where the left iterator itself terminates.
	    if (i == ori_left_len - 1)
		num_leafs--;
	    count += num_leafs;
	}
    }
    return count;
}

// Number of bytes serialize() will write, including alignment padding.
uint64_t LoudsSparse::serializedSize() const {
    uint64_t total = sizeof(height_) + sizeof(start_level_)
        + sizeof(node_count_dense_) + sizeof(child_count_dense_)
        + sizeof(position_t) * height_;
    sizeAlign(total);
    total += labels_->serializedSize();
    total += child_indicator_bits_->serializedSize();
    total += louds_bits_->serializedSize();
    total += suffixes_->serializedSize();
    sizeAlign(total);
    return total;
}

uint64_t LoudsSparse::getMemoryUsage() const {
    return (sizeof(this)
	    + labels_->size()
	    + child_indicator_bits_->size()
	    + louds_bits_->size()
	    + suffixes_->size());
}

// Child node number of the branch at `pos`: rank over the sparse
// child-indicator bits, offset by the children the dense levels consumed.
position_t LoudsSparse::getChildNodeNum(const position_t pos) const {
    const position_t sparse_rank = child_indicator_bits_->rank(pos);
    return sparse_rank + child_count_dense_;
}

// First label position of `node_num`: select the node's LOUDS bit,
// after translating the global node number into a sparse-local rank.
position_t LoudsSparse::getFirstLabelPos(const position_t node_num) const {
    const position_t sparse_rank = node_num + 1 - node_count_dense_;
    return louds_bits_->select(sparse_rank);
}

// Last label position of `node_num`: one before the next node's first
// label, or the encoding's final bit for the very last node.
position_t LoudsSparse::getLastLabelPos(const position_t node_num) const {
    const position_t next_rank = node_num + 2 - node_count_dense_;
    if (next_rank > louds_bits_->numOnes())
        return louds_bits_->numBits() - 1;
    return louds_bits_->select(next_rank) - 1;
}

// Leaf (suffix) index of the branch at `pos`: labels that have children
// carry no suffix, so subtract their count.
position_t LoudsSparse::getSuffixPos(const position_t pos) const {
    const position_t num_children = child_indicator_bits_->rank(pos);
    return pos - num_children;
}

// Number of labels in the node whose first label sits at `pos`.
position_t LoudsSparse::nodeSize(const position_t pos) const {
    assert(louds_bits_->readBit(pos));  // pos must be a node start
    const position_t size = louds_bits_->distanceToNextSetBit(pos);
    return size;
}

// True when `pos` holds the last label of its node.
bool LoudsSparse::isEndofNode(const position_t pos) const {
    if (pos == louds_bits_->numBits() - 1)
        return true;                       // last label of the whole encoding
    return louds_bits_->readBit(pos + 1);  // next position starts a new node
}

// After a failed exact match, park `iter` on the leftmost key of the
// next subtrie: under the smallest in-node label greater than `label`,
// or -- when no such label exists -- past the whole node via iter++.
void LoudsSparse::moveToLeftInNextSubtrie(position_t pos, const position_t node_size,
                                          const label_t label, LoudsSparse::Iter& iter) const {
    const bool found_greater = labels_->searchGreaterThan(label, pos, node_size);
    if (found_greater) {
        iter.append(pos);
        return iter.moveToLeftMostKey();
    }
    // No greater label here: step onto the node's last slot and advance.
    iter.append(pos + node_size - 1);
    return iter++;
}

// Decide whether the suffix stored at `pos` keeps the candidate key >=
// the search key; advance the iterator past this entry otherwise.
// A true return may be a false positive (hashed suffixes).
bool LoudsSparse::compareSuffixGreaterThan(const position_t pos, const std::string& key,
                                           const level_t level, const bool inclusive,
                                           LoudsSparse::Iter& iter) const {
    const position_t suffix_pos = getSuffixPos(pos);
    const int cmp = suffixes_->compare(suffix_pos, key, level);
    const bool definitely_smaller = (cmp != kCouldBePositive) && (cmp < 0);
    if (definitely_smaller) {
        iter++;
        return false;
    }
    iter.is_valid_ = true;
    return true;
}

//============================================================================

// Reset to an empty, invalid iterator; the trie binding and the
// preallocated key/position buffers are kept.
void LoudsSparse::Iter::clear() {
    is_at_terminator_ = false;
    key_len_ = 0;
    is_valid_ = false;
}

// Three-way compare of the iterator's current key against `key`,
// restricted to the sparse levels (key bytes below start_level_).
// Negative/zero/positive like std::string::compare; may yield
// kCouldBePositive when only a hashed suffix can break the tie.
int LoudsSparse::Iter::compare(const std::string& key) const {
    // A terminator marks a key strictly shorter than any key it prefixes.
    if (is_at_terminator_ && (key_len_ - 1) < (key.length() - start_level_))
	return -1;
    std::string iter_key = getKey();
    // Compare only the bytes the sparse levels cover.
    std::string key_sparse = key.substr(start_level_);
    std::string key_sparse_same_length = key_sparse.substr(0, iter_key.length());
    int compare = iter_key.compare(key_sparse_same_length);
    if (compare != 0) 
	return compare;
    // Equal so far: the stored suffix decides.
    position_t suffix_pos = trie_->getSuffixPos(pos_in_trie_[key_len_ - 1]);
    return trie_->suffixes_->compare(suffix_pos, key_sparse, key_len_);
}

std::string LoudsSparse::Iter::getKey() const {
    if (!is_valid_)
	return std::string();
    level_t len = key_len_;
    if (is_at_terminator_)
	len--;
    return std::string((const char*)key_.data(), (size_t)len);
}

// Stores the real-suffix bits of the current key into *suffix and returns
// their length in bits. When the filter keeps no real suffixes (kNone or
// kHash configurations), zeroes *suffix and returns 0.
int LoudsSparse::Iter::getSuffix(word_t* suffix) const {
    const SuffixType type = trie_->suffixes_->getType();
    if ((type != kReal) && (type != kMixed)) {
	*suffix = 0;
	return 0;
    }
    const position_t suffix_pos = trie_->getSuffixPos(pos_in_trie_[key_len_ - 1]);
    *suffix = trie_->suffixes_->readReal(suffix_pos);
    return trie_->suffixes_->getRealSuffixLen();
}

// Returns the current key with its stored real-suffix bits appended as
// extra bytes. *bitlen receives the number of valid bits (0-7) in the last
// appended byte; it is left untouched when no suffix bytes are appended.
std::string LoudsSparse::Iter::getKeyWithSuffix(unsigned* bitlen) const {
    std::string iter_key = getKey();
    if ((trie_->suffixes_->getType() == kReal) || (trie_->suffixes_->getType() == kMixed)) {
	position_t suffix_pos = trie_->getSuffixPos(pos_in_trie_[key_len_ - 1]);
	word_t suffix = trie_->suffixes_->readReal(suffix_pos);
	if (suffix > 0) {
	    level_t suffix_len = trie_->suffixes_->getRealSuffixLen();
	    *bitlen = suffix_len % 8;
	    // Left-align the suffix inside the 64-bit word...
	    suffix <<= (64 - suffix_len);
	    // ...then emit it byte by byte from the most significant end.
	    // NOTE(review): walking from the word's highest-address byte
	    // downward assumes a little-endian host -- TODO confirm.
	    char* suffix_str = reinterpret_cast<char*>(&suffix);
	    suffix_str += 7;
	    unsigned pos = 0;
	    while (pos < suffix_len) {
		iter_key.append(suffix_str, 1);
		suffix_str--;
		pos += 8;
	    }
	}
    }
    return iter_key;
}

void LoudsSparse::Iter::append(const position_t pos) {
    assert(key_len_ < key_.size());
    key_[key_len_] = trie_->labels_->read(pos);
    pos_in_trie_[key_len_] = pos;
    key_len_++;
}

// Extends the iterator's key by one level with an explicitly given label.
void LoudsSparse::Iter::append(const label_t label, const position_t pos) {
    assert(key_len_ < key_.size());
    pos_in_trie_[key_len_] = pos;
    key_[key_len_] = label;
    ++key_len_;
}

// Overwrites the key byte and trie position recorded at the given level.
void LoudsSparse::Iter::set(const level_t level, const position_t pos) {
    assert(level < key_.size());
    pos_in_trie_[level] = pos;
    key_[level] = trie_->labels_->read(pos);
}

void LoudsSparse::Iter::setToFirstLabelInRoot() {
    assert(start_level_ == 0);
    pos_in_trie_[0] = 0;
    key_[0] = trie_->labels_->read(0);
}

void LoudsSparse::Iter::setToLastLabelInRoot() {
    assert(start_level_ == 0);
    pos_in_trie_[0] = trie_->getLastLabelPos(0);
    key_[0] = trie_->labels_->read(pos_in_trie_[0]);
}

// Positions the iterator at the left-most (smallest) key of the subtrie
// below its current position by repeatedly following the first (smallest)
// label of each node until the branch terminates.
void LoudsSparse::Iter::moveToLeftMostKey() {
    // An empty iterator starts the descent from the first label of the
    // start node.
    if (key_len_ == 0) {
	position_t pos = trie_->getFirstLabelPos(start_node_num_);
	label_t label = trie_->labels_->read(pos);
	append(label, pos);
    }

    level_t level = key_len_ - 1;
    position_t pos = pos_in_trie_[level];
    label_t label = trie_->labels_->read(pos);

    // Current position is already a leaf label: the key ends here.
    if (!trie_->child_indicator_bits_->readBit(pos)) {
	// A terminator that is not alone in its node marks a key that is a
	// proper prefix of other keys.
	if ((label == kTerminator)
	    && !trie_->isEndofNode(pos))
	    is_at_terminator_ = true;
	is_valid_ = true;
	return;
    }

    // Descend, taking the first label at every node.
    while (level < trie_->getHeight()) {
	position_t node_num = trie_->getChildNodeNum(pos);
	pos = trie_->getFirstLabelPos(node_num);
	label = trie_->labels_->read(pos);
	// if trie branch terminates
	if (!trie_->child_indicator_bits_->readBit(pos)) {
	    append(label, pos);
	    if ((label == kTerminator)
		&& !trie_->isEndofNode(pos))
		is_at_terminator_ = true;
	    is_valid_ = true;
	    return;
	}
	append(label, pos);
	level++;
    }
    assert(false); // shouldn't reach here
}

// Positions the iterator at the right-most (largest) key of the subtrie
// below its current position by repeatedly following the last (largest)
// label of each node until the branch terminates.
// Fix: the empty-iterator setup previously called getFirstLabelPos() and
// immediately overwrote the result with getLastLabelPos() (a dead store
// left over from moveToLeftMostKey); the useless call is removed.
void LoudsSparse::Iter::moveToRightMostKey() {
    // An empty iterator starts the descent from the last label of the
    // start node.
    if (key_len_ == 0) {
	position_t pos = trie_->getLastLabelPos(start_node_num_);
	label_t label = trie_->labels_->read(pos);
	append(label, pos);
    }

    level_t level = key_len_ - 1;
    position_t pos = pos_in_trie_[level];
    label_t label = trie_->labels_->read(pos);

    // Current position is already a leaf label: the key ends here.
    if (!trie_->child_indicator_bits_->readBit(pos)) {
	// A terminator that is not alone in its node marks a key that is a
	// proper prefix of other keys.
	if ((label == kTerminator)
	    && !trie_->isEndofNode(pos))
	    is_at_terminator_ = true;
	is_valid_ = true;
	return;
    }

    // Descend, taking the last label at every node.
    while (level < trie_->getHeight()) {
	position_t node_num = trie_->getChildNodeNum(pos);
	pos = trie_->getLastLabelPos(node_num);
	label = trie_->labels_->read(pos);
	// if trie branch terminates
	if (!trie_->child_indicator_bits_->readBit(pos)) {
	    append(label, pos);
	    if ((label == kTerminator)
		&& !trie_->isEndofNode(pos))
		is_at_terminator_ = true;
	    is_valid_ = true;
	    return;
	}
	append(label, pos);
	level++;
    }
    assert(false); // shouldn't reach here
}

// Advances the iterator to the next key in lexicographic order.
// Pops levels while stepping right would cross into a new node (or run
// off the bitvector), then steps right once and descends to the left-most
// key of that subtrie. Invalidates the iterator when no next key exists.
void LoudsSparse::Iter::operator ++(int) {
    assert(key_len_ > 0);
    is_at_terminator_ = false;
    position_t pos = pos_in_trie_[key_len_ - 1];
    pos++;
    // A set LOUDS bit at pos means pos begins a different node, so the
    // current node has no label to the right: back up one level.
    while (pos >= trie_->louds_bits_->numBits() || trie_->louds_bits_->readBit(pos)) {
	key_len_--;
	if (key_len_ == 0) {
	    // Exhausted the root node: no successor key.
	    is_valid_ = false;
	    return;
	}
	pos = pos_in_trie_[key_len_ - 1];
	pos++;
    }
    set(key_len_ - 1, pos);
    return moveToLeftMostKey();
}

// Moves the iterator to the previous key in lexicographic order.
// Pops levels while the current position is the first label of its node
// (LOUDS bit set), then steps left once and descends to the right-most
// key of that subtrie. Invalidates the iterator when no previous key exists.
void LoudsSparse::Iter::operator --(int) {
    assert(key_len_ > 0);
    is_at_terminator_ = false;
    position_t pos = pos_in_trie_[key_len_ - 1];
    if (pos == 0) {
	// Already at the very first label of the trie: no predecessor.
	is_valid_ = false;
	return;
    }
    // A set LOUDS bit means pos is its node's first label; back up a level.
    while (trie_->louds_bits_->readBit(pos)) {
	key_len_--;
	if (key_len_ == 0) {
	    is_valid_ = false;
	    return;
	}
	pos = pos_in_trie_[key_len_ - 1];
    }
    pos--;
    set(key_len_ - 1, pos);
    return moveToRightMostKey();
}

} // namespace surf

#endif // LOUDSSPARSE_H_


================================================
FILE: include/popcount.h
================================================
/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ifndef _FASTRANK_POPCOUNT_H_
#define _FASTRANK_POPCOUNT_H_

#include <sys/types.h>
#include <stdio.h>
#include <stdint.h>

namespace surf {

#define L8 0x0101010101010101ULL // Every lowest 8th bit set: 00000001...
#define G2 0xAAAAAAAAAAAAAAAAULL // Every highest 2nd bit: 101010...
#define G4 0x3333333333333333ULL // 00110011 ... used to group the sum of 4 bits.
#define G8 0x0F0F0F0F0F0F0F0FULL
#define H8 0x8080808080808080ULL 
#define L9 0x0040201008040201ULL
#define H9 (L9 << 8)
#define L16 0x0001000100010001ULL
#define H16 0x8000800080008000ULL

#define ONES_STEP_4 ( 0x1111111111111111ULL )
#define ONES_STEP_8 ( 0x0101010101010101ULL )
#define ONES_STEP_9 ( 1ULL << 0 | 1ULL << 9 | 1ULL << 18 | 1ULL << 27 | 1ULL << 36 | 1ULL << 45 | 1ULL << 54 )
#define ONES_STEP_16 ( 1ULL << 0 | 1ULL << 16 | 1ULL << 32 | 1ULL << 48 )
#define MSBS_STEP_4 ( 0x8ULL * ONES_STEP_4 )
#define MSBS_STEP_8 ( 0x80ULL * ONES_STEP_8 )
#define MSBS_STEP_9 ( 0x100ULL * ONES_STEP_9 )
#define MSBS_STEP_16 ( 0x8000ULL * ONES_STEP_16 )
#define INCR_STEP_8 ( 0x80ULL << 56 | 0x40ULL << 48 | 0x20ULL << 40 | 0x10ULL << 32 | 0x8ULL << 24 | 0x4ULL << 16 | 0x2ULL << 8 | 0x1 )

#define ONES_STEP_32 ( 0x0000000100000001ULL )
#define MSBS_STEP_32 ( 0x8000000080000000ULL )
	
#define COMPARE_STEP_8(x,y) ( ( ( ( ( (x) | MSBS_STEP_8 ) - ( (y) & ~MSBS_STEP_8 ) ) ^ (x) ^ ~(y) ) & MSBS_STEP_8 ) >> 7 )
#define LEQ_STEP_8(x,y) ( ( ( ( ( (y) | MSBS_STEP_8 ) - ( (x) & ~MSBS_STEP_8 ) ) ^ (x) ^ (y) ) & MSBS_STEP_8 ) >> 7 )

#define UCOMPARE_STEP_9(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_9 ) - ( (y) & ~MSBS_STEP_9 ) ) | ( x ^ y ) ) ^ ( x | ~y ) ) & MSBS_STEP_9 ) >> 8 )
#define UCOMPARE_STEP_16(x,y) ( ( ( ( ( ( (x) | MSBS_STEP_16 ) - ( (y) & ~MSBS_STEP_16 ) ) | ( x ^ y ) ) ^ ( x | ~y ) ) & MSBS_STEP_16 ) >> 15 )
#define ULEQ_STEP_9(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_9 ) - ( (x) & ~MSBS_STEP_9 ) ) | ( x ^ y ) ) ^ ( x & ~y ) ) & MSBS_STEP_9 ) >> 8 )
#define ULEQ_STEP_16(x,y) ( ( ( ( ( ( (y) | MSBS_STEP_16 ) - ( (x) & ~MSBS_STEP_16 ) ) | ( x ^ y ) ) ^ ( x & ~y ) ) & MSBS_STEP_16 ) >> 15 )
#define ZCOMPARE_STEP_8(x) ( ( ( x | ( ( x | MSBS_STEP_8 ) - ONES_STEP_8 ) ) & MSBS_STEP_8 ) >> 7 )

// Population count of a 64 bit integer in SWAR (SIMD within a register) style
// From Sebastiano Vigna, "Broadword Implementation of Rank/Select Queries"
// http://sux.dsi.unimi.it/paper.pdf p4
// This variant uses multiplication for the last summation instead of
// continuing the shift/mask/addition chain.
// Population count of a 64-bit integer in SWAR (SIMD within a register)
// style, after Sebastiano Vigna, "Broadword Implementation of Rank/Select
// Queries" (http://sux.dsi.unimi.it/paper.pdf, p4). The final byte-sum is
// collected with a multiply instead of more shift/mask/add steps.
inline int suxpopcount(uint64_t x) {
    // Fold each 2-bit group down to its bit count:
    // 00->00, 01->01, 10->01, 11->10.
    x -= (x >> 1) & 0x5555555555555555ULL;
    // Sum adjacent 2-bit counts into 4-bit groups.
    x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
    // Sum adjacent 4-bit counts into per-byte counts.
    x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    // Multiplying by 0x0101...01 accumulates all byte counts into the
    // top byte; shift it down for the result.
    return (int)((x * 0x0101010101010101ULL) >> 56);
}

// Default to using the GCC builtin popcount.  On architectures
// with -march popcnt, this compiles to a single popcnt instruction.
#ifndef popcount
#define popcount __builtin_popcountll
//#define popcount suxpopcount
#endif

#define popcountsize 64ULL
#define popcountmask (popcountsize - 1)

// Counts the 1 bits among the first nbits bits of the bit array that
// starts at word index x (bits are read most-significant-first per word).
inline uint64_t popcountLinear(uint64_t *bits, uint64_t x, uint64_t nbits) {
    if (nbits == 0) { return 0; }
    const uint64_t lastword = (nbits - 1) / 64;
    __builtin_prefetch(bits + x + 7, 0); //huanchen
    uint64_t total = 0;
    // Full words before the last one.
    for (uint64_t i = 0; i < lastword; i++)
        total += __builtin_popcountll(bits[x + i]);
    // The last word may be partial: shift out the bits beyond nbits before
    // counting (popcount is unaffected by where the bits sit).
    const uint64_t valid_tail_bits = ((nbits - 1) & 63) + 1;
    total += __builtin_popcountll(bits[x + lastword] >> (64 - valid_tail_bits));
    return total;
}

// Return the index of the kth bit set in x 
// Returns the position (counted from the most significant bit, 0-based)
// of the k-th set bit of x (k is 1-based), or -1 if x has fewer than k
// set bits. Naive linear scan, used as a reference implementation.
inline int select64_naive(uint64_t x, int k) {
    int pos = 0;
    for (uint64_t mask = 1ULL << 63; mask != 0; mask >>= 1, pos++) {
        if (x & mask) {
            if (--k == 0)
                return pos;
        }
    }
    return -1;
}

// Returns the position (counted from the most significant bit, 0-based)
// of the k-th set bit of x (k is 1-based) via binary search over popcounts
// of shrinking halves. Caller must ensure x has at least k set bits.
inline int select64_popcount_search(uint64_t x, int k) {
    int base = -1;
    for (int width = 32; width > 0; width >>= 1) {
        const int upper_ones = __builtin_popcountll(x >> width);
        if (k > upper_ones) {
            // Target bit lies in the lower half: keep it, skip the upper.
            k -= upper_ones;
            base += width;
            x &= (1ULL << width) - 1;
        } else {
            // Target bit lies in the upper half: shift it down.
            x >>= width;
        }
    }
    return base + k;
}

// Broadword select: returns the position (from the most significant bit)
// of the k-th set bit of x, using Vigna's byte-sum/compare technique.
// Fix: dropped the `register` storage class -- deprecated since C++11 and
// ill-formed in C++17; it had no effect on codegen.
inline int select64_broadword(uint64_t x, int k) {
    uint64_t word = x;
    int residual = k;
    uint64_t byte_sums;

    // Phase 1: per-byte popcounts, then prefix-summed across bytes by the
    // ONES_STEP_8 multiply.
    byte_sums = word - ( ( word & 0xa * ONES_STEP_4 ) >> 1 );
    byte_sums = ( byte_sums & 3 * ONES_STEP_4 ) + ( ( byte_sums >> 2 ) & 3 * ONES_STEP_4 );
    byte_sums = ( byte_sums + ( byte_sums >> 4 ) ) & 0x0f * ONES_STEP_8;
    byte_sums *= ONES_STEP_8;

    // Phase 2: compare each byte sum with the residual
    const uint64_t residual_step_8 = residual * ONES_STEP_8;
    const int place = ( LEQ_STEP_8( byte_sums, residual_step_8 ) * ONES_STEP_8 >> 53 ) & ~0x7;

    // Phase 3: Locate the relevant byte and make 8 copies with incremental masks
    const int byte_rank = residual - ( ( ( byte_sums << 8 ) >> place ) & 0xFF );

    const uint64_t spread_bits = ( word >> place & 0xFF ) * ONES_STEP_8 & INCR_STEP_8;
    const uint64_t bit_sums = ZCOMPARE_STEP_8( spread_bits ) * ONES_STEP_8;

    // Compute the inside-byte location and return the sum
    const uint64_t byte_rank_step_8 = byte_rank * ONES_STEP_8;

    return place + ( LEQ_STEP_8( bit_sums, byte_rank_step_8 ) * ONES_STEP_8 >> 56 );
}

// Single entry point for 64-bit select; currently dispatches to the
// popcount binary-search implementation.
inline int select64(uint64_t x, int k) {
    return select64_popcount_search(x, k);
}

// x is the starting offset of the 512 bits;
// k is the thing we're selecting for.
// Selects the k-th set bit (1-based) within the 512-bit (8-word) block
// beginning at word index x of bits; returns its bit offset inside the
// block, or -1 if the block holds fewer than k set bits.
// Fix: the final bounds check recomputed popcount(bits[x+i]) although the
// loop already holds that value in `pop`; reuse it.
inline int select512(uint64_t *bits, int x, int k) {
    // Non-temporal prefetch of the block start.
    __asm__ __volatile__ (
                          "prefetchnta (%0)\n"
                          : : "r" (&bits[x]) );
    int i = 0;
    int pop = popcount(bits[x+i]);
    // Walk words, discounting their popcounts, until word i holds the bit.
    while (k > pop && i < 7) {
        k -= pop;
        i++;
        pop = popcount(bits[x+i]);
    }
    if (i == 7 && pop < k) {
        return -1;
    }
    // We're now certain that the bit we want is stored in bits[x+i]
    return i*64 + select64(bits[x+i], k);
}

// brute-force linear select
// x is the starting offset of the bits in bv;
// k is the thing we're selecting for (starting from bv[x]).
// bvlen is the total length of bv
// Brute-force linear select over a bit array.
// bits: the array; length: total number of 64-bit words in bits;
// x: starting word offset; k: which set bit to find (1-based, counted
// from bits[x]). Returns the bit offset relative to the start word, or
// (uint64_t)-1 when fewer than k set bits remain.
inline uint64_t selectLinear(uint64_t* bits, uint64_t length, uint64_t x, uint64_t k) {
    if (k > (length - x) * 64)
        return -1;
    uint64_t word_idx = 0;
    uint64_t ones = popcount(bits[x + word_idx]);
    // Discount whole words until the target bit's word is reached.
    while (k > ones && word_idx < (length - 1)) {
        k -= ones;
        word_idx++;
        ones = popcount(bits[x + word_idx]);
    }
    if ((word_idx == length - 1) && (ones < k)) {
        return -1;
    }
    // The target bit lives in bits[x + word_idx].
    return word_idx * 64 + select64(bits[x + word_idx], k);
}

} // namespace surf

#endif /* _FASTRANK_POPCOUNT_H_ */


================================================
FILE: include/rank.hpp
================================================
#ifndef RANK_H_
#define RANK_H_

#include "bitvector.hpp"

#include <assert.h>

#include <vector>

#include "popcount.h"

namespace surf {

// Bitvector augmented with a rank lookup table: one cumulative popcount is
// precomputed per basic block, so rank() costs one table lookup plus a
// popcount scan of at most one basic block.
// Memory is released via destroy(), not the destructor.
class BitvectorRank : public Bitvector {
public:
    BitvectorRank() : basic_block_size_(0), rank_lut_(nullptr) {};

    // basic_block_size: rank sampling granularity in bits.
    // NOTE(review): rank() masks with (basic_block_size_ - 1), which
    // assumes a power-of-two block size -- TODO confirm callers.
    BitvectorRank(const position_t basic_block_size, 
		  const std::vector<std::vector<word_t> >& bitvector_per_level, 
		  const std::vector<position_t>& num_bits_per_level,
		  const level_t start_level = 0,
		  const level_t end_level = 0/* non-inclusive */) 
	: Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) {
	basic_block_size_ = basic_block_size;
	initRankLut();
    }

    ~BitvectorRank() {}

    // Counts the number of 1's in the bitvector up to position pos.
    // pos is zero-based; count is one-based.
    // E.g., for bitvector: 100101000, rank(3) = 2
    position_t rank(position_t pos) const {
        assert(pos <= num_bits_);
        position_t word_per_basic_block = basic_block_size_ / kWordSize;
        position_t block_id = pos / basic_block_size_;
        position_t offset = pos & (basic_block_size_ - 1);
        // Precomputed rank up to the block start, plus an in-block scan.
        return (rank_lut_[block_id] 
		+ popcountLinear(bits_, block_id * word_per_basic_block, offset + 1));
    }

    // Size of the rank lookup table in bytes.
    position_t rankLutSize() const {
	return ((num_bits_ / basic_block_size_ + 1) * sizeof(position_t));
    }

    // Number of bytes serialize() writes (aligned).
    position_t serializedSize() const {
	position_t size = sizeof(num_bits_) + sizeof(basic_block_size_) 
	    + bitsSize() + rankLutSize();
	sizeAlign(size);
	return size;
    }

    // In-memory footprint in bytes.
    position_t size() const {
	return (sizeof(BitvectorRank) + bitsSize() + rankLutSize());
    }

    // Prefetches the cache lines a subsequent rank(pos) will touch.
    void prefetch(position_t pos) const {
	__builtin_prefetch(bits_ + (pos / kWordSize));
	__builtin_prefetch(rank_lut_ + (pos / basic_block_size_));
    }

    // Writes num_bits_, basic_block_size_, the raw bits, and the rank LUT
    // to dst (in that order), advancing dst past the aligned end.
    void serialize(char*& dst) const {
	memcpy(dst, &num_bits_, sizeof(num_bits_));
	dst += sizeof(num_bits_);
	memcpy(dst, &basic_block_size_, sizeof(basic_block_size_));
	dst += sizeof(basic_block_size_);
	memcpy(dst, bits_, bitsSize());
	dst += bitsSize();
	memcpy(dst, rank_lut_, rankLutSize());
	dst += rankLutSize();
	align(dst);
    }

    // Reconstructs a BitvectorRank from a buffer written by serialize(),
    // advancing src past the aligned end. The bits and LUT are copied into
    // freshly allocated arrays (caller frees via destroy()).
    static BitvectorRank* deSerialize(char*& src) {
	BitvectorRank* bv_rank = new BitvectorRank();
	memcpy(&(bv_rank->num_bits_), src, sizeof(bv_rank->num_bits_));
	src += sizeof(bv_rank->num_bits_);
	memcpy(&(bv_rank->basic_block_size_), src, sizeof(bv_rank->basic_block_size_));
	src += sizeof(bv_rank->basic_block_size_);

	bv_rank->bits_ = new word_t[bv_rank->numWords()];
	memcpy(bv_rank->bits_, src, bv_rank->bitsSize());
	src += bv_rank->bitsSize();
	bv_rank->rank_lut_ = new position_t[bv_rank->rankLutSize() / sizeof(position_t)];
	memcpy(bv_rank->rank_lut_, src, bv_rank->rankLutSize());
	src += bv_rank->rankLutSize();
	
	//bv_rank->bits_ = const_cast<word_t*>(reinterpret_cast<const word_t*>(src));
	//src += bv_rank->bitsSize();
	//bv_rank->rank_lut_ = const_cast<position_t*>(reinterpret_cast<const position_t*>(src));
	//src += bv_rank->rankLutSize();
	
	align(src);
	return bv_rank;
    }

    // Frees the heap arrays; must be called exactly once per owned object.
    void destroy() {
	delete[] bits_;
	delete[] rank_lut_;
    }

private:
    // Builds the per-block cumulative popcount table. The final slot holds
    // the total, so rank() on the last (partial) block stays in range.
    void initRankLut() {
        position_t word_per_basic_block = basic_block_size_ / kWordSize;
        position_t num_blocks = num_bits_ / basic_block_size_ + 1;
	rank_lut_ = new position_t[num_blocks];

        position_t cumu_rank = 0;
        for (position_t i = 0; i < num_blocks - 1; i++) {
            rank_lut_[i] = cumu_rank;
            cumu_rank += popcountLinear(bits_, i * word_per_basic_block, basic_block_size_);
        }
	rank_lut_[num_blocks - 1] = cumu_rank;
    }

    position_t basic_block_size_; // rank sampling granularity in bits
    position_t* rank_lut_; //rank look-up table
};

} // namespace surf

#endif // RANK_H_


================================================
FILE: include/select.hpp
================================================
#ifndef SELECT_H_
#define SELECT_H_

#include "bitvector.hpp"

#include <assert.h>

#include <vector>

#include "config.hpp"
#include "popcount.h"

namespace surf {

// Bitvector augmented with a sampled select lookup table: the position of
// every (sample_interval_)-th set bit is precomputed, so select() costs
// one table lookup plus a bounded forward popcount scan.
// Memory is released via destroy(), not the destructor.
class BitvectorSelect : public Bitvector {
public:
    BitvectorSelect() : sample_interval_(0), num_ones_(0), select_lut_(nullptr) {};

    // sample_interval: sampling rate of the select LUT (in set bits).
    BitvectorSelect(const position_t sample_interval, 
		    const std::vector<std::vector<word_t> >& bitvector_per_level, 
		    const std::vector<position_t>& num_bits_per_level,
		    const level_t start_level = 0,
		    const level_t end_level = 0/* non-inclusive */) 
	: Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) {
	sample_interval_ = sample_interval;
	initSelectLut();
    }

    ~BitvectorSelect() {}

    // Returns the position of the rank-th 1 bit.
    // position is zero-based; rank is one-based.
    // E.g., for bitvector: 100101000, select(3) = 5
    position_t select(position_t rank) const {
	assert(rank > 0);
	assert(rank <= num_ones_ + 1);
	position_t lut_idx = rank / sample_interval_;
	position_t rank_left = rank % sample_interval_;
	// The first slot in select_lut_ stores the position of the first 1 bit.
	// Slot i > 0 stores the position of (i * sample_interval_)-th 1 bit
	if (lut_idx == 0)
	    rank_left--;

	position_t pos = select_lut_[lut_idx];

	if (rank_left == 0)
	    return pos;

	// Scan forward from the sampled position for the remaining ones.
	position_t word_id = pos / kWordSize;
	position_t offset = pos % kWordSize;
	if (offset == kWordSize - 1) {
	    word_id++;
	    offset = 0;
	} else {
	    offset++;
	}
	word_t word = bits_[word_id] << offset >> offset; //zero-out most significant bits
	position_t ones_count_in_word = popcount(word);
	while (ones_count_in_word < rank_left) {
	    word_id++;
	    word = bits_[word_id];
	    rank_left -= ones_count_in_word;
	    ones_count_in_word = popcount(word);
	}
	return (word_id * kWordSize + select64_popcount_search(word, rank_left));
    }

    // Size of the select lookup table in bytes.
    position_t selectLutSize() const {
	return ((num_ones_ / sample_interval_ + 1) * sizeof(position_t));
    }

    // Number of bytes serialize() writes (aligned).
    position_t serializedSize() const {
	position_t size = sizeof(num_bits_) + sizeof(sample_interval_) + sizeof(num_ones_)
	    + bitsSize() + selectLutSize();
	sizeAlign(size);
	return size;
    }

    // In-memory footprint in bytes.
    position_t size() const {
	return (sizeof(BitvectorSelect) + bitsSize() + selectLutSize());
    }

    // Total number of set bits (computed by initSelectLut()).
    position_t numOnes() const {
	return num_ones_;
    }

    // Writes num_bits_, sample_interval_, num_ones_, the raw bits, and the
    // select LUT to dst (in that order), advancing dst past the aligned end.
    void serialize(char*& dst) const {
	memcpy(dst, &num_bits_, sizeof(num_bits_));
	dst += sizeof(num_bits_);
	memcpy(dst, &sample_interval_, sizeof(sample_interval_));
	dst += sizeof(sample_interval_);
	memcpy(dst, &num_ones_, sizeof(num_ones_));
	dst += sizeof(num_ones_);
	memcpy(dst, bits_, bitsSize());
	dst += bitsSize();
	memcpy(dst, select_lut_, selectLutSize());
	dst += selectLutSize();
	align(dst);
    }

    // Reconstructs a BitvectorSelect from a buffer written by serialize(),
    // advancing src past the aligned end. The bits and LUT are copied into
    // freshly allocated arrays (caller frees via destroy()).
    static BitvectorSelect* deSerialize(char*& src) {
	BitvectorSelect* bv_select = new BitvectorSelect();
	memcpy(&(bv_select->num_bits_), src, sizeof(bv_select->num_bits_));
	src += sizeof(bv_select->num_bits_);
	memcpy(&(bv_select->sample_interval_), src, sizeof(bv_select->sample_interval_));
	src += sizeof(bv_select->sample_interval_);
	memcpy(&(bv_select->num_ones_), src, sizeof(bv_select->num_ones_));
	src += sizeof(bv_select->num_ones_);

	bv_select->bits_ = new word_t[bv_select->numWords()];
	memcpy(bv_select->bits_, src, bv_select->bitsSize());
	src += bv_select->bitsSize();
	bv_select->select_lut_ = new position_t[bv_select->selectLutSize() / sizeof(position_t)];
	memcpy(bv_select->select_lut_, src, bv_select->selectLutSize());
	src += bv_select->selectLutSize();
	
	//bv_select->bits_ = const_cast<word_t*>(reinterpret_cast<const word_t*>(src));
	//src += bv_select->bitsSize();
	//bv_select->select_lut_ = const_cast<position_t*>(reinterpret_cast<const position_t*>(src));
	//src += bv_select->selectLutSize();
	align(src);
	return bv_select;
    }

    // Frees the heap arrays; must be called exactly once per owned object.
    void destroy() {
	delete[] bits_;
	delete[] select_lut_;
    }

private:
    // This function currently assumes that the first bit in the
    // bitvector is one.
    // Builds the sampled select table and counts num_ones_ as a side effect.
    void initSelectLut() {
	position_t num_words = num_bits_ / kWordSize;
	if (num_bits_ % kWordSize != 0)
	    num_words++;

	std::vector<position_t> select_lut_vector;
	select_lut_vector.push_back(0); //ASSERT: first bit is 1
	position_t sampling_ones = sample_interval_;
	position_t cumu_ones_upto_word = 0;
	for (position_t i = 0; i < num_words; i++) {
	    position_t num_ones_in_word = popcount(bits_[i]);
	    // Record every sample that falls inside this word.
	    while (sampling_ones <= (cumu_ones_upto_word + num_ones_in_word)) {
		int diff = sampling_ones - cumu_ones_upto_word;
		position_t result_pos = i * kWordSize + select64_popcount_search(bits_[i], diff);
		select_lut_vector.push_back(result_pos);
		sampling_ones += sample_interval_;
	    }
	    cumu_ones_upto_word += popcount(bits_[i]);
	}

	num_ones_ = cumu_ones_upto_word;
	position_t num_samples = select_lut_vector.size();
	select_lut_ = new position_t[num_samples];
	for (position_t i = 0; i < num_samples; i++)
	    select_lut_[i] = select_lut_vector[i];
    }

private:
    position_t sample_interval_; // LUT sampling rate, in set bits
    position_t num_ones_; // total number of set bits
    position_t* select_lut_; //select look-up table
};

} // namespace surf

#endif // SELECT_H_


================================================
FILE: include/suffix.hpp
================================================
#ifndef SUFFIX_H_
#define SUFFIX_H_

#include "bitvector.hpp"

#include <assert.h>

#include <vector>

#include "config.hpp"
#include "hash.hpp"

namespace surf {

// Max suffix_len_ = 64 bits
// For kReal suffixes, if the stored key is not long enough to provide
// suffix_len_ suffix bits, its suffix field is cleared (i.e., all 0's)
// to indicate that there is no suffix info associated with the key.
// Fixed-width suffix store built on Bitvector. Each key's suffix occupies
// getSuffixLen() = hash_suffix_len_ + real_suffix_len_ bits, packed
// most-significant-first; for kMixed the hash bits sit above the real bits.
// Memory is released via destroy(), not the destructor.
class BitvectorSuffix : public Bitvector {
public:
    BitvectorSuffix() : type_(kNone), hash_suffix_len_(0), real_suffix_len_(0) {};

    // The combined suffix width must fit in a single word.
    BitvectorSuffix(const SuffixType type,
                    const level_t hash_suffix_len, const level_t real_suffix_len,
                    const std::vector<std::vector<word_t> >& bitvector_per_level,
                    const std::vector<position_t>& num_bits_per_level,
                    const level_t start_level = 0,
                    level_t end_level = 0/* non-inclusive */)
	: Bitvector(bitvector_per_level, num_bits_per_level, start_level, end_level) {
	assert((hash_suffix_len + real_suffix_len) <= kWordSize);
	type_ = type;
	hash_suffix_len_ = hash_suffix_len;
        real_suffix_len_ = real_suffix_len;
    }

    // Builds a len-bit hash suffix from the key's hash value.
    static word_t constructHashSuffix(const std::string& key, const level_t len) {
	word_t suffix = suffixHash(key);
	// Keep len bits of the hash (skipping kHashShift top bits),
	// right-aligned in the word.
	suffix <<= (kWordSize - len - kHashShift);
	suffix >>= (kWordSize - len);
	return suffix;
    }

    // Builds a len-bit real suffix from the key bytes starting at `level`.
    // Returns 0 (no suffix info) when the key is too short to supply len bits.
    static word_t constructRealSuffix(const std::string& key,
				      const level_t level, const level_t len) {
	if (key.length() < level || ((key.length() - level) * 8) < len)
	    return 0;
	word_t suffix = 0;
	level_t num_complete_bytes = len / 8;
	if (num_complete_bytes > 0) {
	    suffix += (word_t)(label_t)key[level];
	    for (position_t i = 1; i < num_complete_bytes; i++) {
		suffix <<= 8;
		suffix += (word_t)(uint8_t)key[level + i];
	    }
	}
	// Append the high-order bits of the next byte for a partial tail.
	level_t offset = len % 8;
	if (offset > 0) {
	    suffix <<= offset;
	    word_t remaining_bits = 0;
	    remaining_bits = (word_t)(uint8_t)key[level + num_complete_bytes];
	    remaining_bits >>= (8 - offset);
	    suffix += remaining_bits;
	}
	return suffix;
    }

    // Concatenates hash bits (high) and real bits (low) for kMixed.
    static word_t constructMixedSuffix(const std::string& key, const level_t hash_len,
				       const level_t real_level, const level_t real_len) {
        word_t hash_suffix = constructHashSuffix(key, hash_len);
        word_t real_suffix = constructRealSuffix(key, real_level, real_len);
        word_t suffix = hash_suffix;
        suffix <<= real_len;
        suffix |= real_suffix;
        return suffix;
    }

    // Dispatches to the appropriate constructor for the configured type;
    // kNone yields 0.
    static word_t constructSuffix(const SuffixType type, const std::string& key,
                                  const level_t hash_len,
                                  const level_t real_level, const level_t real_len) {
	switch (type) {
	case kHash:
	    return constructHashSuffix(key, hash_len);
	case kReal:
	    return constructRealSuffix(key, real_level, real_len);
        case kMixed:
            return constructMixedSuffix(key, hash_len, real_level, real_len);
	default:
	    return 0;
        }
    }

    // Extracts the hash portion (the bits above the real bits).
    static word_t extractHashSuffix(const word_t suffix, const level_t real_suffix_len) {
        return (suffix >> real_suffix_len);
    }

    // Extracts the low real_suffix_len bits.
    static word_t extractRealSuffix(const word_t suffix, const level_t real_suffix_len) {
        word_t real_suffix_mask = 1;
        real_suffix_mask <<= real_suffix_len;
        real_suffix_mask--;
        return (suffix & real_suffix_mask);
    }

    SuffixType getType() const {
	return type_;
    }

    // Total stored bits per suffix.
    level_t getSuffixLen() const {
	return hash_suffix_len_ + real_suffix_len_;
    }

    level_t getHashSuffixLen() const {
	return hash_suffix_len_;
    }

    level_t getRealSuffixLen() const {
	return real_suffix_len_;
    }

    // Number of bytes serialize() writes (aligned).
    position_t serializedSize() const {
	position_t size = sizeof(num_bits_) + sizeof(type_)
            + sizeof(hash_suffix_len_) + sizeof(real_suffix_len_) + bitsSize();
	sizeAlign(size);
	return size;
    }

    // In-memory footprint in bytes.
    position_t size() const {
	return (sizeof(BitvectorSuffix) + bitsSize());
    }

    word_t read(const position_t idx) const;
    word_t readReal(const position_t idx) const;
    bool checkEquality(const position_t idx, const std::string& key, const level_t level) const;

    // Compare stored suffix to querying suffix.
    // kReal suffix type only.
    int compare(const position_t idx, const std::string& key, const level_t level) const;

    // Writes the header fields followed by the raw bits (omitted for
    // kNone), advancing dst past the aligned end.
    void serialize(char*& dst) const {
	memcpy(dst, &num_bits_, sizeof(num_bits_));
	dst += sizeof(num_bits_);
	memcpy(dst, &type_, sizeof(type_));
	dst += sizeof(type_);
	memcpy(dst, &hash_suffix_len_, sizeof(hash_suffix_len_));
	dst += sizeof(hash_suffix_len_);
        memcpy(dst, &real_suffix_len_, sizeof(real_suffix_len_));
	dst += sizeof(real_suffix_len_);
	if (type_ != kNone) {
	    memcpy(dst, bits_, bitsSize());
	    dst += bitsSize();
	}
	align(dst);
    }

    // Reconstructs a BitvectorSuffix from a buffer written by serialize(),
    // advancing src past the aligned end. Bits are copied into a freshly
    // allocated array (caller frees via destroy()).
    static BitvectorSuffix* deSerialize(char*& src) {
	BitvectorSuffix* sv = new BitvectorSuffix();
	memcpy(&(sv->num_bits_), src, sizeof(sv->num_bits_));
	src += sizeof(sv->num_bits_);
	memcpy(&(sv->type_), src, sizeof(sv->type_));
	src += sizeof(sv->type_);
	memcpy(&(sv->hash_suffix_len_), src, sizeof(sv->hash_suffix_len_));
	src += sizeof(sv->hash_suffix_len_);
        memcpy(&(sv->real_suffix_len_), src, sizeof(sv->real_suffix_len_));
	src += sizeof(sv->real_suffix_len_);
	if (sv->type_ != kNone) {
	    sv->bits_ = new word_t[sv->numWords()];
	    memcpy(sv->bits_, src, sv->bitsSize());
	    src += sv->bitsSize();
	    
	    //sv->bits_ = const_cast<word_t*>(reinterpret_cast<const word_t*>(src));
	    //src += sv->bitsSize();
	}
	align(src);
	return sv;
    }

    // Frees the bit array (only allocated when a suffix type is configured).
    void destroy() {
	if (type_ != kNone)
	    delete[] bits_;
    }

private:
    SuffixType type_; // kNone / kHash / kReal / kMixed
    level_t hash_suffix_len_; // in bits
    level_t real_suffix_len_; // in bits
};

// Returns the suffix stored at index idx, right-aligned in the word; 0 when
// no suffix vector is kept or idx is out of range.
// Fix: when the field straddles a word boundary, the shift amount
// (kWordSize - offset - suffix_len) underflowed its unsigned type --
// undefined behavior that only "worked" via x86's shift-count masking.
// Computing it as (2 * kWordSize - offset - suffix_len) keeps the shift in
// [1, kWordSize - 1] and yields the identical value, without UB.
word_t BitvectorSuffix::read(const position_t idx) const {
    if (type_ == kNone) 
	return 0;

    // NOTE(review): assumes getSuffixLen() >= 1 whenever type_ != kNone;
    // a zero length would make the shifts below full-width -- confirm.
    level_t suffix_len = getSuffixLen();
    if (idx * suffix_len >= num_bits_) 
	return 0;

    position_t bit_pos = idx * suffix_len;
    position_t word_id = bit_pos / kWordSize;
    position_t offset = bit_pos & (kWordSize - 1);
    // Bits are stored most-significant-first: left-align the field, then
    // shift it down into the low suffix_len bits.
    word_t ret_word = (bits_[word_id] << offset) >> (kWordSize - suffix_len);
    // Field straddles the word boundary: take the remaining
    // (offset + suffix_len - kWordSize) bits from the top of the next word.
    if (offset + suffix_len > kWordSize)
	ret_word += (bits_[word_id+1] >> (2 * kWordSize - offset - suffix_len));
    return ret_word;
}

// Returns only the real-suffix portion (low real_suffix_len_ bits) of the
// suffix stored at idx.
word_t BitvectorSuffix::readReal(const position_t idx) const {
    const word_t stored = read(idx);
    return extractRealSuffix(stored, real_suffix_len_);
}

bool BitvectorSuffix::checkEquality(const position_t idx, 
				    const std::string& key, const level_t level) const {
    if (type_ == kNone) 
	return true;
    if (idx * getSuffixLen() >= num_bits_) 
	return false;

    word_t stored_suffix = read(idx);
    if (type_ == kReal) {
	// if no suffix info for the stored key
	if (stored_suffix == 0) 
	    return true;
	// if the querying key is shorter than the stored key
	if (key.length() < level || ((key.length() - level) * 8) < real_suffix_len_) 
	    return false;
    }
    word_t querying_suffix 
	= constructSuffix(type_, key, hash_suffix_len_, level, real_suffix_len_);
    return (stored_suffix == querying_suffix);
}

// If no real suffix is stored for the key, compare returns 0.
// int BitvectorSuffix::compare(const position_t idx, 
// 			     const std::string& key, const level_t level) const {
//     if ((type_ == kNone) || (type_ == kHash) || (idx * getSuffixLen() >= num_bits_))
// 	return 0;
//     word_t stored_suffix = read(idx);
//     word_t querying_suffix = constructRealSuffix(key, level, real_suffix_len_);
//     if (type_ == kMixed)
//         stored_suffix = extractRealSuffix(stored_suffix, real_suffix_len_);

//     if (stored_suffix == 0) 
// 	return 0;
//     if (stored_suffix < querying_suffix) 
// 	return -1;
//     else if (stored_suffix == querying_suffix) 
// 	return 0;
//     else 
// 	return 1;
// }

// Compares the real-suffix bits stored at idx with the query key's suffix
// starting at `level`. Returns -1 / 1 for a definite ordering, or
// kCouldBePositive when the suffixes cannot disambiguate (no real suffix
// stored, hash-only configuration, out-of-range idx, or equal suffixes).
int BitvectorSuffix::compare(const position_t idx, 
			     const std::string& key, const level_t level) const {
    if ((idx * getSuffixLen() >= num_bits_) || (type_ == kNone) || (type_ == kHash))
	return kCouldBePositive;

    word_t stored_suffix = read(idx);
    word_t querying_suffix = constructRealSuffix(key, level, real_suffix_len_);
    // For kMixed, only the real (low) bits are order-preserving.
    if (type_ == kMixed)
        stored_suffix = extractRealSuffix(stored_suffix, real_suffix_len_);

    // Both empty: nothing to compare on.
    if ((stored_suffix == 0) && (querying_suffix == 0))
	return kCouldBePositive;
    // A stored 0 means "no suffix info", which sorts before any real bits.
    else if ((stored_suffix == 0) || (stored_suffix < querying_suffix))
	return -1;
    else if (stored_suffix == querying_suffix) 
	return kCouldBePositive;
    else 
	return 1;
}

} // namespace surf

#endif // SUFFIX_H_


================================================
FILE: include/surf.hpp
================================================
#ifndef SURF_H_
#define SURF_H_

#include <string>
#include <vector>

#include "config.hpp"
#include "louds_dense.hpp"
#include "louds_sparse.hpp"
#include "surf_builder.hpp"

namespace surf {

// SuRF (Succinct Range Filter): an approximate-membership filter built from
// a SORTED key set, supporting point lookups, range queries, and approximate
// counts. The trie is split into a dense-encoded top (LoudsDense) and a
// sparse-encoded bottom (LoudsSparse).
// NOTE: the destructor does not free the tries; callers release them
// explicitly via destroy().
class SuRF {
public:
    // Bidirectional iterator over stored key prefixes. Chains a dense-level
    // iterator with a sparse-level one; positioning logic in SuRF decides
    // when control passes from the dense levels to the sparse levels.
    class Iter {
    public:
	Iter() {}
	Iter(const SuRF* filter) {
	    dense_iter_ = LoudsDense::Iter(filter->louds_dense_);
	    sparse_iter_ = LoudsSparse::Iter(filter->louds_sparse_);
	    could_be_fp_ = false;
	}

	void clear();
	bool isValid() const;
	bool getFpFlag() const;
	int compare(const std::string& key) const;
	std::string getKey() const;
	int getSuffix(word_t* suffix) const;
	std::string getKeyWithSuffix(unsigned* bitlen) const;

	// Returns true if the status of the iterator after the operation is valid
	bool operator ++(int);
	bool operator --(int);

    private:
	void passToSparse();
	bool incrementDenseIter();
	bool incrementSparseIter();
	bool decrementDenseIter();
	bool decrementSparseIter();

    private:
	// true implies that dense_iter_ is valid
	LoudsDense::Iter dense_iter_;
	LoudsSparse::Iter sparse_iter_;
	// Set when the last positioning call may have matched a truncated
	// suffix (false positive) rather than an exact stored key.
	bool could_be_fp_;

	friend class SuRF;
    };

public:
    SuRF() {}

    //------------------------------------------------------------------
    // Input keys must be SORTED
    //------------------------------------------------------------------
    SuRF(const std::vector<std::string>& keys) {
	create(keys, kIncludeDense, kSparseDenseRatio, kNone, 0, 0);
    }

    SuRF(const std::vector<std::string>& keys, const SuffixType suffix_type,
	 const level_t hash_suffix_len, const level_t real_suffix_len) {
	create(keys, kIncludeDense, kSparseDenseRatio, suffix_type, hash_suffix_len, real_suffix_len);
    }
    
    SuRF(const std::vector<std::string>& keys,
	 const bool include_dense, const uint32_t sparse_dense_ratio,
	 const SuffixType suffix_type, const level_t hash_suffix_len, const level_t real_suffix_len) {
	create(keys, include_dense, sparse_dense_ratio, suffix_type, hash_suffix_len, real_suffix_len);
    }

    ~SuRF() {}

    // Builds the filter from `keys` (must be sorted); see the .cpp-style
    // definition below for details.
    void create(const std::vector<std::string>& keys,
		const bool include_dense, const uint32_t sparse_dense_ratio,
		const SuffixType suffix_type,
                const level_t hash_suffix_len, const level_t real_suffix_len);

    bool lookupKey(const std::string& key) const;
    // This function searches in a conservative way: if inclusive is true
    // and the stored key prefix matches key, iter stays at this key prefix.
    SuRF::Iter moveToKeyGreaterThan(const std::string& key, const bool inclusive) const;
    SuRF::Iter moveToKeyLessThan(const std::string& key, const bool inclusive) const;
    SuRF::Iter moveToFirst() const;
    SuRF::Iter moveToLast() const;
    bool lookupRange(const std::string& left_key, const bool left_inclusive, 
		     const std::string& right_key, const bool right_inclusive);
    // Accurate except at the boundaries --> undercount by at most 2
    uint64_t approxCount(const std::string& left_key, const std::string& right_key);
    uint64_t approxCount(const SuRF::Iter* iter, const SuRF::Iter* iter2);

    uint64_t serializedSize() const;
    uint64_t getMemoryUsage() const;
    level_t getHeight() const;
    level_t getSparseStartLevel() const;

    // Flattens both tries into a single newly-allocated buffer of
    // serializedSize() bytes. Ownership of the buffer passes to the caller.
    char* serialize() const {
	uint64_t size = serializedSize();
	char* data = new char[size];
	char* cur_data = data;
	louds_dense_->serialize(cur_data);
	louds_sparse_->serialize(cur_data);
	assert(cur_data - data == (int64_t)size);
	return data;
    }

    // Rebuilds a SuRF from a buffer produced by serialize(). The returned
    // object does not own a builder; only the two tries are populated.
    static SuRF* deSerialize(char* src) {
	SuRF* surf = new SuRF();
	surf->louds_dense_ = LoudsDense::deSerialize(src);
	surf->louds_sparse_ = LoudsSparse::deSerialize(src);
	surf->iter_ = SuRF::Iter(surf);
	return surf;
    }

    // Releases the memory held by both tries. Must not be called twice.
    void destroy() {
	louds_dense_->destroy();
	louds_sparse_->destroy();
    }

private:
    // In-class initializers guard against UB when a default-constructed
    // SuRF (e.g. inside deSerialize) is destroyed or inspected before the
    // tries are assigned; builder_ is only non-null during create().
    LoudsDense* louds_dense_ = nullptr;
    LoudsSparse* louds_sparse_ = nullptr;
    SuRFBuilder* builder_ = nullptr;
    SuRF::Iter iter_;
    SuRF::Iter iter2_;
};

// Builds the filter from `keys` (which MUST be sorted): the builder
// constructs the per-level LOUDS bitmaps, the dense and sparse tries are
// materialized from it, and the internal iterator is (re)bound to this
// filter.
void SuRF::create(const std::vector<std::string>& keys, 
		  const bool include_dense, const uint32_t sparse_dense_ratio,
		  const SuffixType suffix_type,
                  const level_t hash_suffix_len, const level_t real_suffix_len) {
    builder_ = new SuRFBuilder(include_dense, sparse_dense_ratio,
                              suffix_type, hash_suffix_len, real_suffix_len);
    builder_->build(keys);
    louds_dense_ = new LoudsDense(builder_);
    louds_sparse_ = new LoudsSparse(builder_);
    iter_ = SuRF::Iter(this);
    // The builder is only needed during construction; release it and null
    // the member so no dangling pointer survives past this call.
    delete builder_;
    builder_ = nullptr;
}

// Point lookup. First probes the dense levels; if the key's path leaves
// the dense region, louds_dense_ reports a non-zero connecting node and
// the search continues in the sparse levels. May return a false positive
// (suffixes are truncated), never a false negative.
bool SuRF::lookupKey(const std::string& key) const {
    position_t connect_node_num = 0;
    if (!louds_dense_->lookupKey(key, connect_node_num))
	return false;
    if (connect_node_num == 0)
	return true;  // fully resolved within the dense levels
    return louds_sparse_->lookupKey(key, connect_node_num);
}

// Positions a fresh iterator at the smallest stored key >= `key`
// (or > `key` when `inclusive` is false). The boolean returned by the
// trie-level moveToKeyGreaterThan calls is recorded as could_be_fp_: the
// match may be a false positive caused by suffix truncation.
SuRF::Iter SuRF::moveToKeyGreaterThan(const std::string& key, const bool inclusive) const {
    SuRF::Iter iter(this);
    iter.could_be_fp_ = louds_dense_->moveToKeyGreaterThan(key, inclusive, iter.dense_iter_);

    // Dense levels exhausted with no candidate: return the invalid iterator.
    if (!iter.dense_iter_.isValid())
	return iter;
    // Both the search and the left-most completion finished inside the
    // dense levels; nothing to do in the sparse levels.
    if (iter.dense_iter_.isComplete())
	return iter;

    if (!iter.dense_iter_.isSearchComplete()) {
	// The key's path descends into the sparse levels: continue the
	// search there, starting from the dense iterator's send-out node.
	iter.passToSparse();
	iter.could_be_fp_ = louds_sparse_->moveToKeyGreaterThan(key, inclusive, iter.sparse_iter_);
	// Sparse subtree exhausted: the answer is the next dense candidate.
	if (!iter.sparse_iter_.isValid())
	    iter.incrementDenseIter();
	return iter;
    } else if (!iter.dense_iter_.isMoveLeftComplete()) {
	// Search finished in the dense levels, but completing the found
	// prefix to its left-most key continues in the sparse levels.
	iter.passToSparse();
	iter.sparse_iter_.moveToLeftMostKey();
	return iter;
    }

    assert(false); // shouldn't reach here
    return iter;
}

// Positions an iterator at the largest stored key <= `key` (or < `key`
// when `inclusive` is false), implemented on top of
// moveToKeyGreaterThan(key, false): stepping back from the strict
// successor yields the predecessor-or-equal key; when `key` itself is
// stored (lookupKey succeeds) a second step yields the strict predecessor.
// NOTE(review): `inclusive` is not consulted after the initial call, and
// the decrements are skipped entirely when the fp flag is set — confirm
// this matches the intended conservative contract.
SuRF::Iter SuRF::moveToKeyLessThan(const std::string& key, const bool inclusive) const {
    SuRF::Iter iter = moveToKeyGreaterThan(key, false);
    // No key > `key` exists: the answer is the last key in the trie.
    if (!iter.isValid()) {
	iter = moveToLast();
	return iter;
    }
    if (!iter.getFpFlag()) {
	iter--;
	if (lookupKey(key))
	    iter--;
    }
    return iter;
}

// Returns an iterator positioned at the smallest stored key. When the
// dense portion is empty (height 0) the walk starts directly in the
// sparse levels; otherwise it starts at the root's first label and only
// drops into the sparse levels if the left-most descent leaves the dense
// region.
SuRF::Iter SuRF::moveToFirst() const {
    SuRF::Iter iter(this);
    if (louds_dense_->getHeight() == 0) {
	// No dense levels: the whole trie lives in the sparse encoding.
	iter.sparse_iter_.setToFirstLabelInRoot();
	iter.sparse_iter_.moveToLeftMostKey();
	return iter;
    }
    iter.dense_iter_.setToFirstLabelInRoot();
    iter.dense_iter_.moveToLeftMostKey();
    if (!iter.dense_iter_.isMoveLeftComplete()) {
	// Left-most key continues below the dense levels.
	iter.passToSparse();
	iter.sparse_iter_.moveToLeftMostKey();
    }
    return iter;
}

// Returns an iterator positioned at the largest stored key — the mirror
// image of moveToFirst(): start at the root's last label and descend
// right-most, crossing into the sparse levels only when the dense descent
// does not complete.
SuRF::Iter SuRF::moveToLast() const {
    SuRF::Iter iter(this);
    if (louds_dense_->getHeight() == 0) {
	// No dense levels: operate entirely within the sparse encoding.
	iter.sparse_iter_.setToLastLabelInRoot();
	iter.sparse_iter_.moveToRightMostKey();
	return iter;
    }
    iter.dense_iter_.setToLastLabelInRoot();
    iter.dense_iter_.moveToRightMostKey();
    if (!iter.dense_iter_.isMoveRightComplete()) {
	// Right-most key continues below the dense levels.
	iter.passToSparse();
	iter.sparse_iter_.moveToRightMostKey();
    }
    return iter;
}

// Returns true if any stored key may fall in the range bounded by
// left_key / right_key (each bound inclusive per its flag). Positions the
// member iterator at the smallest candidate >= the left bound (same
// dense-to-sparse handoff as moveToKeyGreaterThan, using the reusable
// iter_ instead of a fresh iterator), then checks it against the right
// bound. May report a false positive; never a false negative.
bool SuRF::lookupRange(const std::string& left_key, const bool left_inclusive, 
		       const std::string& right_key, const bool right_inclusive) {
    iter_.clear();
    louds_dense_->moveToKeyGreaterThan(left_key, left_inclusive, iter_.dense_iter_);
    // Dense levels exhausted: no key at or above the left bound.
    if (!iter_.dense_iter_.isValid()) return false;
    if (!iter_.dense_iter_.isComplete()) {
	if (!iter_.dense_iter_.isSearchComplete()) {
	    // Search continues in the sparse levels.
	    iter_.passToSparse();
	    louds_sparse_->moveToKeyGreaterThan(left_key, left_inclusive, iter_.sparse_iter_);
	    if (!iter_.sparse_iter_.isValid()) {
		// Sparse subtree exhausted: advance to the next dense candidate.
		iter_.incrementDenseIter();
	    }
	} else if (!iter_.dense_iter_.isMoveLeftComplete()) {
	    // Complete the found prefix to its left-most key in the sparse levels.
	    iter_.passToSparse();
	    iter_.sparse_iter_.moveToLeftMostKey();
	}
    }
    if (!iter_.isValid()) return false;
    int compare = iter_.compare(right_key);
    // Inconclusive comparison (e.g. truncated suffix): answer conservatively.
    if (compare == kCouldBePositive)
	return true;
    if (right_inclusive)
	return (compare <= 0);
    else
	return (compare < 0);
}

// Approximate number of keys between the two iterator positions. Sums the
// dense-level and sparse-level counts; the dense pass reports the
// boundary send-out node numbers, which the sparse pass consumes to
// continue counting below the dense region. Returns 0 if either iterator
// is invalid.
uint64_t SuRF::approxCount(const SuRF::Iter* iter, const SuRF::Iter* iter2) {
    if (!(iter->isValid() && iter2->isValid()))
	return 0;
    position_t left_boundary_node = 0;
    position_t right_boundary_node = 0;
    uint64_t total = louds_dense_->approxCount(&(iter->dense_iter_),
					       &(iter2->dense_iter_),
					       left_boundary_node,
					       right_boundary_node);
    total += louds_sparse_->approxCount(&(iter->sparse_iter_),
					&(iter2->sparse_iter_),
					left_boundary_node,
					right_boundary_node);
    return total;
}

uint64_t SuRF::a
Download .txt
gitextract_dyvduj7s/

├── .gitignore
├── .gitmodules
├── .travis.yml
├── CMakeLists.txt
├── CodeCoverage.cmake
├── LICENSE
├── README.md
├── bench/
│   ├── CMakeLists.txt
│   ├── MurmurHash3.h
│   ├── bench.hpp
│   ├── bloom.hpp
│   ├── filter.hpp
│   ├── filter_bloom.hpp
│   ├── filter_factory.hpp
│   ├── filter_surf.hpp
│   ├── run.sh
│   ├── workload.cpp
│   ├── workload_arf.cpp
│   ├── workload_gen/
│   │   ├── gen_load.py
│   │   ├── gen_txn.py
│   │   ├── gen_workload.sh
│   │   ├── workload_spec/
│   │   │   ├── workload_template
│   │   │   ├── workloadc_email_latest
│   │   │   ├── workloadc_email_uniform
│   │   │   ├── workloadc_email_zipfian
│   │   │   ├── workloadc_randint_latest
│   │   │   ├── workloadc_randint_uniform
│   │   │   └── workloadc_randint_zipfian
│   │   └── ycsb_download.sh
│   └── workload_multi_thread.cpp
├── include/
│   ├── bitvector.hpp
│   ├── config.hpp
│   ├── hash.hpp
│   ├── label_vector.hpp
│   ├── louds_dense.hpp
│   ├── louds_sparse.hpp
│   ├── popcount.h
│   ├── rank.hpp
│   ├── select.hpp
│   ├── suffix.hpp
│   ├── surf.hpp
│   └── surf_builder.hpp
├── simple_example.cpp
├── src/
│   └── CMakeLists.txt
└── test/
    ├── CMakeLists.txt
    ├── unitTest/
    │   ├── CMakeLists.txt
    │   ├── test_bitvector.cpp
    │   ├── test_label_vector.cpp
    │   ├── test_louds_dense.cpp
    │   ├── test_louds_dense_small.cpp
    │   ├── test_louds_sparse.cpp
    │   ├── test_louds_sparse_small.cpp
    │   ├── test_rank.cpp
    │   ├── test_select.cpp
    │   ├── test_suffix.cpp
    │   ├── test_suffix_vector.cpp
    │   ├── test_surf.cpp
    │   ├── test_surf_builder.cpp
    │   └── test_surf_small.cpp
    └── words.txt
Download .txt
SYMBOL INDEX (376 symbols across 38 files)

FILE: bench/MurmurHash3.h
  function rotl32 (line 26) | inline uint32_t rotl32 ( uint32_t x, int8_t r )
  function rotl64 (line 31) | inline uint64_t rotl64 ( uint64_t x, int8_t r )
  function FORCE_INLINE (line 45) | FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
  function FORCE_INLINE (line 50) | FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
  function FORCE_INLINE (line 58) | FORCE_INLINE uint32_t fmix32 ( uint32_t h )
  function FORCE_INLINE (line 71) | FORCE_INLINE uint64_t fmix64 ( uint64_t k )
  function MurmurHash3_x86_32 (line 84) | void MurmurHash3_x86_32 ( const void * key, int len,
  function MurmurHash3_x86_128 (line 140) | void MurmurHash3_x86_128 ( const void * key, const int len,
  function MurmurHash3_x64_128 (line 245) | void MurmurHash3_x64_128 ( const void * key, const int len,

FILE: bench/bench.hpp
  type bench (line 20) | namespace bench {
    function getNow (line 37) | double getNow() {
    function uint64ToString (line 43) | std::string uint64ToString(uint64_t key) {
    function stringToUint64 (line 48) | uint64_t stringToUint64(std::string str_key) {
    function loadKeysFromFile (line 54) | void loadKeysFromFile(const std::string& file_name, const bool is_key_...
    function loadKeysFromFile (line 76) | void loadKeysFromFile(const std::string& file_name, uint64_t num_records,
    function selectKeysToInsert (line 89) | void selectKeysToInsert(const unsigned percent,
    function selectIntKeysToInsert (line 102) | void selectIntKeysToInsert(const unsigned percent,
    function modifyKeyByte (line 115) | void modifyKeyByte(std::vector<std::string> &keys, int pos) {
    function getUpperBoundKey (line 125) | std::string getUpperBoundKey(const std::string& key_type, const std::s...

FILE: bench/bloom.hpp
  function DecodeFixed32 (line 20) | inline uint32_t DecodeFixed32(const char* ptr) {
  function BloomHash (line 58) | static void BloomHash(const string &key, uint32_t* out) {
  function BloomHash (line 62) | static void BloomHash(const uint64_t key, uint32_t* out) {
  class BloomFilter (line 66) | class BloomFilter {
    method BloomFilter (line 72) | BloomFilter(int bits_per_key)
    method CreateFilter (line 80) | void CreateFilter(vector<string> keys, int n, string* dst) const {
    method CreateFilter (line 111) | void CreateFilter(vector<uint64_t> keys, int n, string* dst) const {
    method KeyMayMatch (line 142) | bool KeyMayMatch(const string& key, const string& bloom_filter) const {
    method KeyMayMatch (line 170) | bool KeyMayMatch(const uint64_t key, const string& bloom_filter) const {

FILE: bench/filter.hpp
  type bench (line 7) | namespace bench {
    class Filter (line 9) | class Filter {

FILE: bench/filter_bloom.hpp
  type bench (line 9) | namespace bench {
    class FilterBloom (line 11) | class FilterBloom : public Filter {
      method FilterBloom (line 14) | FilterBloom(const std::vector<std::string>& keys) {
      method lookup (line 23) | bool lookup(const std::string& key) {
      method lookupRange (line 27) | bool lookupRange(const std::string& left_key, const std::string& rig...
      method approxCount (line 32) | bool approxCount(const std::string& left_key, const std::string& rig...
      method getMemoryUsage (line 37) | uint64_t getMemoryUsage() {

FILE: bench/filter_factory.hpp
  type bench (line 8) | namespace bench {
    class FilterFactory (line 10) | class FilterFactory {
      method Filter (line 12) | static Filter* createFilter(const std::string& filter_type,

FILE: bench/filter_surf.hpp
  type bench (line 9) | namespace bench {
    class FilterSuRF (line 11) | class FilterSuRF : public Filter {
      method FilterSuRF (line 14) | FilterSuRF(const std::vector<std::string>& keys,
      method lookup (line 27) | bool lookup(const std::string& key) {
      method lookupRange (line 31) | bool lookupRange(const std::string& left_key, const std::string& rig...
      method approxCount (line 36) | bool approxCount(const std::string& left_key, const std::string& rig...
      method getMemoryUsage (line 40) | uint64_t getMemoryUsage() {

FILE: bench/workload.cpp
  function main (line 4) | int main(int argc, char *argv[]) {

FILE: bench/workload_arf.cpp
  function main (line 13) | int main(int argc, char *argv[]) {

FILE: bench/workload_gen/gen_load.py
  class bcolors (line 4) | class bcolors:
  function reverseHostName (line 16) | def reverseHostName ( email ) :

FILE: bench/workload_gen/gen_txn.py
  class bcolors (line 4) | class bcolors:
  function reverseHostName (line 16) | def reverseHostName ( email ) :

FILE: bench/workload_multi_thread.cpp
  type ThreadArg (line 9) | struct ThreadArg {
  function main (line 47) | int main(int argc, char *argv[]) {

FILE: include/bitvector.hpp
  type surf (line 10) | namespace surf {
    class Bitvector (line 12) | class Bitvector {
      method Bitvector (line 14) | Bitvector() : num_bits_(0), bits_(nullptr) {}
      method Bitvector (line 16) | Bitvector(const std::vector<std::vector<word_t> >& bitvector_per_level,
      method position_t (line 30) | position_t numBits() const {
      method position_t (line 34) | position_t numWords() const {
      method position_t (line 42) | position_t bitsSize() const {
      method position_t (line 47) | position_t size() const {
    function position_t (line 77) | position_t Bitvector::distanceToNextSetBit (const position_t pos) const {
    function position_t (line 104) | position_t Bitvector::distanceToPrevSetBit (const position_t pos) const {
    function position_t (line 132) | position_t Bitvector::totalNumBits(const std::vector<position_t>& num_...

FILE: include/config.hpp
  type surf (line 7) | namespace surf {
    type SuffixType (line 30) | enum SuffixType {
    function align (line 37) | void align(char*& ptr) {
    function sizeAlign (line 41) | void sizeAlign(position_t& size) {
    function sizeAlign (line 45) | void sizeAlign(uint64_t& size) {
    function uint64ToString (line 49) | std::string uint64ToString(const uint64_t word) {
    function stringToUint64 (line 54) | uint64_t stringToUint64(const std::string& str_word) {

FILE: include/hash.hpp
  type surf (line 6) | namespace surf {
    function DecodeFixed32 (line 11) | inline uint32_t DecodeFixed32(const char* ptr) {
    function Hash (line 17) | inline uint32_t Hash(const char* data, size_t n, uint32_t seed) {
    function suffixHash (line 48) | inline uint32_t suffixHash(const std::string &key) {
    function suffixHash (line 52) | inline uint32_t suffixHash(const char* key, const int keylen) {

FILE: include/label_vector.hpp
  type surf (line 10) | namespace surf {
    class LabelVector (line 12) | class LabelVector {
      method LabelVector (line 14) | LabelVector() : num_bytes_(0), labels_(nullptr) {}
      method LabelVector (line 16) | LabelVector(const std::vector<std::vector<label_t> >& labels_per_level,
      method position_t (line 43) | position_t getNumBytes() const {
      method position_t (line 47) | position_t serializedSize() const {
      method position_t (line 53) | position_t size() const {
      method label_t (line 57) | label_t read(const position_t pos) const {
      method label_t (line 61) | label_t operator[](const position_t pos) const {
      method serialize (line 75) | void serialize(char*& dst) const {
      method LabelVector (line 83) | static LabelVector* deSerialize(char*& src) {
      method destroy (line 98) | void destroy() {

FILE: include/louds_dense.hpp
  type surf (line 11) | namespace surf {
    class LoudsDense (line 13) | class LoudsDense {
      class Iter (line 15) | class Iter {
        method Iter (line 17) | Iter() : is_valid_(false) {}
        method Iter (line 18) | Iter(LoudsDense* trie) : is_valid_(false), is_search_complete_(fal...
        method isValid (line 31) | bool isValid() const { return is_valid_; }
        method isSearchComplete (line 32) | bool isSearchComplete() const { return is_search_complete_; }
        method isMoveLeftComplete (line 33) | bool isMoveLeftComplete() const { return is_move_left_complete_; }
        method isMoveRightComplete (line 34) | bool isMoveRightComplete() const { return is_move_right_complete_; }
        method isComplete (line 35) | bool isComplete() const {
        method position_t (line 44) | position_t getSendOutNodeNum() const { return send_out_node_num_; }
        method setSendOutNodeNum (line 56) | inline void setSendOutNodeNum(position_t node_num) { send_out_node...
      method LoudsDense (line 83) | LoudsDense() {}
      method getHeight (line 99) | uint64_t getHeight() const { return height_; }
      method serialize (line 103) | void serialize(char*& dst) const {
      method LoudsDense (line 116) | static LoudsDense* deSerialize(char*& src) {
      method destroy (line 133) | void destroy() {
    function position_t (line 385) | position_t LoudsDense::getChildNodeNum(const position_t pos) const {
    function position_t (line 389) | position_t LoudsDense::getSuffixPos(const position_t pos, const bool i...
    function position_t (line 400) | position_t LoudsDense::getNextPos(const position_t pos) const {
    function position_t (line 404) | position_t LoudsDense::getPrevPos(const position_t pos, bool* is_out_o...

FILE: include/louds_sparse.hpp
  type surf (line 13) | namespace surf {
    class LoudsSparse (line 15) | class LoudsSparse {
      class Iter (line 17) | class Iter {
        method Iter (line 19) | Iter() : is_valid_(false) {}
        method Iter (line 20) | Iter(LoudsSparse* trie) : is_valid_(false), trie_(trie), start_nod...
        method isValid (line 30) | bool isValid() const { return is_valid_; }
        method position_t (line 36) | position_t getStartNodeNum() const { return start_node_num_; }
        method setStartNodeNum (line 37) | void setStartNodeNum(position_t node_num) { start_node_num_ = node...
      method LoudsSparse (line 66) | LoudsSparse() {}
      method level_t (line 82) | level_t getHeight() const { return height_; }
      method level_t (line 83) | level_t getStartLevel() const { return start_level_; }
      method serialize (line 87) | void serialize(char*& dst) const {
      method LoudsSparse (line 106) | static LoudsSparse* deSerialize(char*& src) {
      method destroy (line 129) | void destroy() {
    function position_t (line 296) | position_t LoudsSparse::appendToPosList(std::vector<position_t>& pos_l...
    function position_t (line 421) | position_t LoudsSparse::getChildNodeNum(const position_t pos) const {
    function position_t (line 425) | position_t LoudsSparse::getFirstLabelPos(const position_t node_num) co...
    function position_t (line 429) | position_t LoudsSparse::getLastLabelPos(const position_t node_num) con...
    function position_t (line 436) | position_t LoudsSparse::getSuffixPos(const position_t pos) const {
    function position_t (line 440) | position_t LoudsSparse::nodeSize(const position_t pos) const {

FILE: include/popcount.h
  function namespace (line 9) | namespace surf {

FILE: include/rank.hpp
  type surf (line 12) | namespace surf {
    class BitvectorRank (line 14) | class BitvectorRank : public Bitvector {
      method BitvectorRank (line 16) | BitvectorRank() : basic_block_size_(0), rank_lut_(nullptr) {}
      method BitvectorRank (line 18) | BitvectorRank(const position_t basic_block_size,
      method position_t (line 33) | position_t rank(position_t pos) const {
      method position_t (line 42) | position_t rankLutSize() const {
      method position_t (line 46) | position_t serializedSize() const {
      method position_t (line 53) | position_t size() const {
      method prefetch (line 57) | void prefetch(position_t pos) const {
      method serialize (line 62) | void serialize(char*& dst) const {
      method BitvectorRank (line 74) | static BitvectorRank* deSerialize(char*& src) {
      method destroy (line 97) | void destroy() {
      method initRankLut (line 103) | void initRankLut() {

FILE: include/select.hpp
  type surf (line 13) | namespace surf {
    class BitvectorSelect (line 15) | class BitvectorSelect : public Bitvector {
      method BitvectorSelect (line 17) | BitvectorSelect() : sample_interval_(0), num_ones_(0), select_lut_(n...
      method BitvectorSelect (line 19) | BitvectorSelect(const position_t sample_interval,
      method position_t (line 34) | position_t select(position_t rank) const {
      method position_t (line 68) | position_t selectLutSize() const {
      method position_t (line 72) | position_t serializedSize() const {
      method position_t (line 79) | position_t size() const {
      method position_t (line 83) | position_t numOnes() const {
      method serialize (line 87) | void serialize(char*& dst) const {
      method BitvectorSelect (line 101) | static BitvectorSelect* deSerialize(char*& src) {
      method destroy (line 125) | void destroy() {
      method initSelectLut (line 133) | void initSelectLut() {

FILE: include/suffix.hpp
  type surf (line 13) | namespace surf {
    class BitvectorSuffix (line 19) | class BitvectorSuffix : public Bitvector {
      method BitvectorSuffix (line 21) | BitvectorSuffix() : type_(kNone), hash_suffix_len_(0), real_suffix_l...
      method BitvectorSuffix (line 23) | BitvectorSuffix(const SuffixType type,
      method word_t (line 36) | static word_t constructHashSuffix(const std::string& key, const leve...
      method word_t (line 43) | static word_t constructRealSuffix(const std::string& key,
      method word_t (line 67) | static word_t constructMixedSuffix(const std::string& key, const lev...
      method word_t (line 77) | static word_t constructSuffix(const SuffixType type, const std::stri...
      method word_t (line 92) | static word_t extractHashSuffix(const word_t suffix, const level_t r...
      method word_t (line 96) | static word_t extractRealSuffix(const word_t suffix, const level_t r...
      method SuffixType (line 103) | SuffixType getType() const {
      method level_t (line 107) | level_t getSuffixLen() const {
      method level_t (line 111) | level_t getHashSuffixLen() const {
      method level_t (line 115) | level_t getRealSuffixLen() const {
      method position_t (line 119) | position_t serializedSize() const {
      method position_t (line 126) | position_t size() const {
      method serialize (line 138) | void serialize(char*& dst) const {
      method BitvectorSuffix (line 154) | static BitvectorSuffix* deSerialize(char*& src) {
      method destroy (line 176) | void destroy() {
    function word_t (line 187) | word_t BitvectorSuffix::read(const position_t idx) const {
    function word_t (line 204) | word_t BitvectorSuffix::readReal(const position_t idx) const {

FILE: include/surf.hpp
  type surf (line 12) | namespace surf {
    class SuRF (line 14) | class SuRF {
      class Iter (line 16) | class Iter {
        method Iter (line 18) | Iter() {}
        method Iter (line 19) | Iter(const SuRF* filter) {
      method SuRF (line 54) | SuRF() {}
      method SuRF (line 59) | SuRF(const std::vector<std::string>& keys) {
      method SuRF (line 63) | SuRF(const std::vector<std::string>& keys, const SuffixType suffix_t...
      method SuRF (line 68) | SuRF(const std::vector<std::string>& keys,
      method SuRF (line 109) | static SuRF* deSerialize(char* src) {
      method destroy (line 117) | void destroy() {
    function level_t (line 285) | level_t SuRF::getHeight() const {
    function level_t (line 289) | level_t SuRF::getSparseStartLevel() const {

FILE: include/surf_builder.hpp
  type surf (line 13) | namespace surf {
    class SuRFBuilder (line 15) | class SuRFBuilder {
      method SuRFBuilder (line 17) | SuRFBuilder() : sparse_start_level_(0), suffix_type_(kNone) {}
      method SuRFBuilder (line 18) | explicit SuRFBuilder(bool include_dense, uint32_t sparse_dense_ratio,
      method readBit (line 32) | static bool readBit(const std::vector<word_t>& bits, const position_...
      method setBit (line 39) | static void setBit(std::vector<word_t>& bits, const position_t pos) {
      method level_t (line 46) | level_t getTreeHeight() const {
      method level_t (line 78) | level_t getSparseStartLevel() const {
      method SuffixType (line 81) | SuffixType getSuffixType() const {
      method level_t (line 84) | level_t getSuffixLen() const {
      method level_t (line 87) | level_t getHashSuffixLen() const {
      method level_t (line 90) | level_t getRealSuffixLen() const {
      method isSameKey (line 95) | static bool isSameKey(const std::string& a, const std::string& b) {
    function level_t (line 199) | level_t SuRFBuilder::skipCommonPrefix(const std::string& key) {
    function level_t (line 208) | level_t SuRFBuilder::insertKeyBytesToTrieUntilUnique(const std::string...
    function position_t (line 413) | position_t SuRFBuilder::getNumItems(const level_t level) const {

FILE: simple_example.cpp
  function main (line 8) | int main() {

FILE: test/unitTest/test_bitvector.cpp
  type surf (line 13) | namespace surf {
    type bitvectortest (line 15) | namespace bitvectortest {
      class BitvectorUnitTest (line 21) | class BitvectorUnitTest : public ::testing::Test {
        method SetUp (line 23) | virtual void SetUp () {
        method TearDown (line 30) | virtual void TearDown () {
      function TEST_F (line 68) | TEST_F (BitvectorUnitTest, readBitTest) {
      function TEST_F (line 152) | TEST_F (BitvectorUnitTest, distanceToNextSetBitTest) {
      function TEST_F (line 179) | TEST_F (BitvectorUnitTest, distanceToPrevSetBitTest) {
      function loadWordList (line 211) | void loadWordList() {
  function main (line 226) | int main (int argc, char** argv) {

FILE: test/unitTest/test_label_vector.cpp
  type surf (line 13) | namespace surf {
    type labelvectortest (line 15) | namespace labelvectortest {
      class LabelVectorUnitTest (line 21) | class LabelVectorUnitTest : public ::testing::Test {
        method SetUp (line 23) | virtual void SetUp () {
        method TearDown (line 30) | virtual void TearDown () {
      function TEST_F (line 107) | TEST_F (LabelVectorUnitTest, readTest) {
      function TEST_F (line 122) | TEST_F (LabelVectorUnitTest, searchAlgTest) {
      function TEST_F (line 190) | TEST_F (LabelVectorUnitTest, searchTest) {
      function TEST_F (line 197) | TEST_F (LabelVectorUnitTest, serializeTest) {
      function TEST_F (line 203) | TEST_F (LabelVectorUnitTest, searchGreaterThanTest) {
      function loadWordList (line 251) | void loadWordList() {
  function main (line 266) | int main (int argc, char** argv) {

FILE: test/unitTest/test_louds_dense.cpp
  type surf (line 13) | namespace surf {
    type densetest (line 15) | namespace densetest {
      class DenseUnitTest (line 30) | class DenseUnitTest : public ::testing::Test {
        method SetUp (line 32) | virtual void SetUp () {
        method TearDown (line 37) | virtual void TearDown () {
      function getCommonPrefixLen (line 55) | static int getCommonPrefixLen(const std::string &a, const std::strin...
      function getMax (line 62) | static int getMax(int a, int b) {
      function TEST_F (line 142) | TEST_F (DenseUnitTest, lookupWordTest) {
      function TEST_F (line 156) | TEST_F (DenseUnitTest, serializeTest) {
      function TEST_F (line 169) | TEST_F (DenseUnitTest, lookupIntTest) {
      function TEST_F (line 190) | TEST_F (DenseUnitTest, moveToKeyGreaterThanWordTest) {
      function TEST_F (line 237) | TEST_F (DenseUnitTest, moveToKeyGreaterThanIntTest) {
      function TEST_F (line 283) | TEST_F (DenseUnitTest, IteratorIncrementWordTest) {
      function TEST_F (line 306) | TEST_F (DenseUnitTest, IteratorIncrementIntTest) {
      function TEST_F (line 329) | TEST_F (DenseUnitTest, IteratorDecrementWordTest) {
      function TEST_F (line 352) | TEST_F (DenseUnitTest, IteratorDecrementIntTest) {
      function TEST_F (line 376) | TEST_F (DenseUnitTest, approxCountWordTest) {
      function TEST_F (line 404) | TEST_F (DenseUnitTest, approxCountIntTest) {
      function loadWordList (line 432) | void loadWordList() {
  function main (line 447) | int main (int argc, char** argv) {

FILE: test/unitTest/test_louds_dense_small.cpp
  type surf (line 11) | namespace surf {
    type surftest (line 13) | namespace surftest {
      class SuRFSmallTest (line 20) | class SuRFSmallTest : public ::testing::Test {
        method SetUp (line 22) | virtual void SetUp () {}
        method TearDown (line 23) | virtual void TearDown () {}
      function TEST_F (line 26) | TEST_F (SuRFSmallTest, ExampleInPaperTest) {
  function main (line 62) | int main (int argc, char** argv) {

FILE: test/unitTest/test_louds_sparse.cpp
  type surf (line 13) | namespace surf {
    type sparsetest (line 15) | namespace sparsetest {
      class SparseUnitTest (line 30) | class SparseUnitTest : public ::testing::Test {
        method SetUp (line 32) | virtual void SetUp () {
        method TearDown (line 37) | virtual void TearDown () {
      function getCommonPrefixLen (line 55) | static int getCommonPrefixLen(const std::string &a, const std::strin...
      function getMax (line 62) | static int getMax(int a, int b) {
      function TEST_F (line 143) | TEST_F (SparseUnitTest, lookupWordTest) {
      function TEST_F (line 158) | TEST_F (SparseUnitTest, serializeTest) {
      function TEST_F (line 172) | TEST_F (SparseUnitTest, lookupIntTest) {
      function TEST_F (line 190) | TEST_F (SparseUnitTest, moveToKeyGreaterThanWordTest) {
      function TEST_F (line 236) | TEST_F (SparseUnitTest, moveToKeyGreaterThanIntTest) {
      function TEST_F (line 281) | TEST_F (SparseUnitTest, IteratorIncrementWordTest) {
      function TEST_F (line 303) | TEST_F (SparseUnitTest, IteratorIncrementIntTest) {
      function TEST_F (line 325) | TEST_F (SparseUnitTest, IteratorDecrementWordTest) {
      function TEST_F (line 347) | TEST_F (SparseUnitTest, IteratorDecrementIntTest) {
      function TEST_F (line 370) | TEST_F (SparseUnitTest, FirstAndLastLabelInRootTest) {
      function TEST_F (line 395) | TEST_F (SparseUnitTest, approxCountWordTest) {
      function TEST_F (line 421) | TEST_F (SparseUnitTest, approxCountIntTest) {
      function loadWordList (line 447) | void loadWordList() {
  function main (line 462) | int main (int argc, char** argv) {

FILE: test/unitTest/test_louds_sparse_small.cpp
  type surf (line 11) | namespace surf {
    type surftest (line 13) | namespace surftest {
      class SuRFSmallTest (line 20) | class SuRFSmallTest : public ::testing::Test {
        method SetUp (line 22) | virtual void SetUp () {}
        method TearDown (line 23) | virtual void TearDown () {}
      function TEST_F (line 26) | TEST_F (SuRFSmallTest, ExampleInPaperTest) {
  function main (line 57) | int main (int argc, char** argv) {

FILE: test/unitTest/test_rank.cpp
  type surf (line 13) | namespace surf {
    type ranktest (line 15) | namespace ranktest {
      class RankUnitTest (line 21) | class RankUnitTest : public ::testing::Test {
        method SetUp (line 23) | virtual void SetUp () {
        method TearDown (line 32) | virtual void TearDown () {
      function TEST_F (line 110) | TEST_F (RankUnitTest, readBitTest) {
      function TEST_F (line 132) | TEST_F (RankUnitTest, rankTest) {
      function TEST_F (line 141) | TEST_F (RankUnitTest, serializeTest) {
      function loadWordList (line 147) | void loadWordList() {
  function main (line 162) | int main (int argc, char** argv) {

FILE: test/unitTest/test_select.cpp
  type surf (line 13) | namespace surf {
    type selecttest (line 15) | namespace selecttest {
      class SelectUnitTest (line 21) | class SelectUnitTest : public ::testing::Test {
        method SetUp (line 23) | virtual void SetUp () {
        method TearDown (line 31) | virtual void TearDown () {
      function TEST_F (line 87) | TEST_F (SelectUnitTest, readBitTest) {
      function TEST_F (line 102) | TEST_F (SelectUnitTest, selectTest) {
      function TEST_F (line 107) | TEST_F (SelectUnitTest, serializeTest) {
      function loadWordList (line 113) | void loadWordList() {
  function main (line 128) | int main (int argc, char** argv) {

FILE: test/unitTest/test_suffix.cpp
  type surf (line 13) | namespace surf {
    type suffixtest (line 15) | namespace suffixtest {
      class SuffixUnitTest (line 21) | class SuffixUnitTest : public ::testing::Test {
        method SetUp (line 23) | virtual void SetUp () {
        method TearDown (line 27) | virtual void TearDown () {
      function getCommonPrefixLen (line 42) | static int getCommonPrefixLen(const std::string &a, const std::strin...
      function getMax (line 49) | static int getMax(int a, int b) {
      function TEST_F (line 105) | TEST_F (SuffixUnitTest, constructRealSuffixTest) {
      function TEST_F (line 135) | TEST_F (SuffixUnitTest, constructMixedSuffixTest) {
      function TEST_F (line 170) | TEST_F (SuffixUnitTest, checkEqualityTest) {
      function TEST_F (line 214) | TEST_F (SuffixUnitTest, serializeTest) {
      function loadWordList (line 256) | void loadWordList() {
  function main (line 271) | int main (int argc, char** argv) {

FILE: test/unitTest/test_suffix_vector.cpp
  type surf (line 13) | namespace surf {
    type suffixvectortest (line 16) | namespace suffixvectortest {
      class SuffixVectorUnitTest (line 22) | class SuffixVectorUnitTest : public ::testing::Test {
        method SetUp (line 24) | virtual void SetUp () {
        method TearDown (line 27) | virtual void TearDown () {
      function TEST_F (line 36) | TEST_F (SuffixVectorUnitTest, buildNoneTest) {
      function TEST_F (line 44) | TEST_F (SuffixVectorUnitTest, buildHashTest) {
      function TEST_F (line 52) | TEST_F (SuffixVectorUnitTest, buildRealTest) {
      function loadWordList (line 63) | void loadWordList() {
  function main (line 78) | int main (int argc, char** argv) {

FILE: test/unitTest/test_surf.cpp
  type surf (line 12) | namespace surf {
    type surftest (line 14) | namespace surftest {
      class SuRFUnitTest (line 27) | class SuRFUnitTest : public ::testing::Test {
        method SetUp (line 29) | virtual void SetUp () {
        method TearDown (line 34) | virtual void TearDown () {
      function getCommonPrefixLen (line 52) | static int getCommonPrefixLen(const std::string &a, const std::strin...
      function getMax (line 59) | static int getMax(int a, int b) {
      function isEqual (line 65) | static bool isEqual(const std::string& a, const std::string& b, cons...
      function TEST_F (line 158) | TEST_F (SuRFUnitTest, IntStringConvertTest) {
      function TEST_F (line 164) | TEST_F (SuRFUnitTest, lookupWordTest) {
      function TEST_F (line 175) | TEST_F (SuRFUnitTest, serializeTest) {
      function TEST_F (line 185) | TEST_F (SuRFUnitTest, lookupIntTest) {
      function TEST_F (line 202) | TEST_F (SuRFUnitTest, moveToKeyGreaterThanWordTest) {
      function TEST_F (line 295) | TEST_F (SuRFUnitTest, moveToKeyGreaterThanIntTest) {
      function TEST_F (line 338) | TEST_F (SuRFUnitTest, moveToKeyLessThanWordTest) {
      function TEST_F (line 378) | TEST_F (SuRFUnitTest, moveToKeyLessThanIntTest) {
      function TEST_F (line 415) | TEST_F (SuRFUnitTest, IteratorIncrementWordTest) {
      function TEST_F (line 439) | TEST_F (SuRFUnitTest, IteratorIncrementIntTest) {
      function TEST_F (line 463) | TEST_F (SuRFUnitTest, IteratorDecrementWordTest) {
      function TEST_F (line 487) | TEST_F (SuRFUnitTest, IteratorDecrementIntTest) {
      function TEST_F (line 512) | TEST_F (SuRFUnitTest, lookupRangeWordTest) {
      function TEST_F (line 542) | TEST_F (SuRFUnitTest, lookupRangeIntTest) {
      function TEST_F (line 572) | TEST_F (SuRFUnitTest, approxCountWordTest) {
      function TEST_F (line 593) | TEST_F (SuRFUnitTest, approxCountIntTest) {
      function loadWordList (line 614) | void loadWordList() {
  function main (line 629) | int main (int argc, char** argv) {

FILE: test/unitTest/test_surf_builder.cpp
  type surf (line 12) | namespace surf {
    type buildertest (line 14) | namespace buildertest {
      class SuRFBuilderUnitTest (line 22) | class SuRFBuilderUnitTest : public ::testing::Test {
        method SetUp (line 24) | virtual void SetUp () {
      function getCommonPrefixLen (line 51) | static int getCommonPrefixLen(const std::string &a, const std::strin...
      function getMax (line 58) | static int getMax(int a, int b) {
      function printIndent (line 94) | void printIndent(level_t level) {
      function TEST_F (line 319) | TEST_F (SuRFBuilderUnitTest, buildSparseStringTest) {
      function TEST_F (line 332) | TEST_F (SuRFBuilderUnitTest, buildSparseDuplicateTest) {
      function TEST_F (line 345) | TEST_F (SuRFBuilderUnitTest, buildSparseIntTest) {
      function TEST_F (line 358) | TEST_F (SuRFBuilderUnitTest, buildDenseStringTest) {
      function TEST_F (line 371) | TEST_F (SuRFBuilderUnitTest, buildDenseIntTest) {
      function loadWordList (line 384) | void loadWordList() {
  function main (line 401) | int main (int argc, char** argv) {

FILE: test/unitTest/test_surf_small.cpp
  type surf (line 11) | namespace surf {
    type surftest (line 13) | namespace surftest {
      class SuRFSmallTest (line 18) | class SuRFSmallTest : public ::testing::Test {
        method SetUp (line 20) | virtual void SetUp () {}
        method TearDown (line 21) | virtual void TearDown () {}
      function TEST_F (line 24) | TEST_F (SuRFSmallTest, ExampleInPaperTest) {
  function main (line 57) | int main (int argc, char** argv) {
Condensed preview — 60 files, each showing its path, character count, and a content snippet. Download the .json file or copy it to your clipboard to get the full structured content (3,053K chars).
[
  {
    "path": ".gitignore",
    "chars": 270,
    "preview": "# Prerequisites\n*.d\n\n# Compiled Object files\n*.slo\n*.lo\n*.o\n*.obj\n\n# Precompiled Headers\n*.gch\n*.pch\n\n# Compiled Dynamic"
  },
  {
    "path": ".gitmodules",
    "chars": 91,
    "preview": "[submodule \"ARF\"]\n\tpath = ARF\n\turl = https://github.com/efficient/ARF.git\n\tbranch = master\n"
  },
  {
    "path": ".travis.yml",
    "chars": 669,
    "preview": "language: cpp\nsudo: required\ndist: xenial\ncompiler: gcc\n\ninstall:\n- sudo apt-get install build-essential\n- sudo apt-get "
  },
  {
    "path": "CMakeLists.txt",
    "chars": 926,
    "preview": "cmake_minimum_required (VERSION 2.6)\nproject (SuRF)\n\nmessage(STATUS \"Configuring...\" ${CMAKE_PROJECT_NAME})\n\nif (NOT CMA"
  },
  {
    "path": "CodeCoverage.cmake",
    "chars": 9138,
    "preview": "# Copyright (c) 2012 - 2017, Lars Bilke\n# All rights reserved.\n#\n# Redistribution and use in source and binary forms, wi"
  },
  {
    "path": "LICENSE",
    "chars": 10325,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "README.md",
    "chars": 1921,
    "preview": "# Succinct Range Filter (SuRF)\n[![Build Status](https://travis-ci.org/efficient/SuRF.svg?branch=master)](https://travis-"
  },
  {
    "path": "bench/CMakeLists.txt",
    "chars": 269,
    "preview": "add_executable(workload workload.cpp)\ntarget_link_libraries(workload)\n\nadd_executable(workload_multi_thread workload_mul"
  },
  {
    "path": "bench/MurmurHash3.h",
    "chars": 7658,
    "preview": "//-----------------------------------------------------------------------------\n// MurmurHash3 was written by Austin App"
  },
  {
    "path": "bench/bench.hpp",
    "chars": 3711,
    "preview": "#include <assert.h>\n#include <pthread.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <time.h>\n#i"
  },
  {
    "path": "bench/bloom.hpp",
    "chars": 5518,
    "preview": "// Copyright (c) 2012 The LevelDB Authors. All rights reserved.\n// Use of this source code is governed by a BSD-style li"
  },
  {
    "path": "bench/filter.hpp",
    "chars": 443,
    "preview": "#ifndef FILTER_H_\n#define FILTER_H_\n\n#include <string>\n#include <vector>\n\nnamespace bench {\n\nclass Filter {\npublic:\n    "
  },
  {
    "path": "bench/filter_bloom.hpp",
    "chars": 1114,
    "preview": "#ifndef FILTER_BLOOM_H_\n#define FILTER_BLOOM_H_\n\n#include <string>\n#include <vector>\n\n#include \"bloom.hpp\"\n\nnamespace be"
  },
  {
    "path": "bench/filter_factory.hpp",
    "chars": 1042,
    "preview": "#ifndef FILTER_FACTORY_H_\n#define FILTER_FACTORY_H_\n\n#include \"filter.hpp\"\n#include \"filter_bloom.hpp\"\n#include \"filter_"
  },
  {
    "path": "bench/filter_surf.hpp",
    "chars": 1225,
    "preview": "#ifndef FILTER_SURF_H_\n#define FILTER_SURF_H_\n\n#include <string>\n#include <vector>\n\n#include \"surf.hpp\"\n\nnamespace bench"
  },
  {
    "path": "bench/run.sh",
    "chars": 1512,
    "preview": "#!bin/bash\n\necho 'Bloom Filter, random int, point queries'\n../build/bench/workload Bloom 1 mixed 50 0 randint point zipf"
  },
  {
    "path": "bench/workload.cpp",
    "chars": 9457,
    "preview": "#include \"bench.hpp\"\n#include \"filter_factory.hpp\"\n\nint main(int argc, char *argv[]) {\n    if (argc != 9) {\n\tstd::cout <"
  },
  {
    "path": "bench/workload_arf.cpp",
    "chars": 5380,
    "preview": "#include \"bench.hpp\"\n#include \"ARF.h\"\n#include \"Database.h\"\n#include \"Query.h\"\n\nstatic const int kARFSize = 70000000;\nst"
  },
  {
    "path": "bench/workload_gen/gen_load.py",
    "chars": 3904,
    "preview": "import sys\nimport os\n\nclass bcolors:\n    HEADER = '\\033[95m'\n    OKBLUE = '\\033[94m'\n    OKGREEN = '\\033[92m'\n    WARNIN"
  },
  {
    "path": "bench/workload_gen/gen_txn.py",
    "chars": 3725,
    "preview": "import sys\nimport os\n\nclass bcolors:\n    HEADER = '\\033[95m'\n    OKBLUE = '\\033[94m'\n    OKGREEN = '\\033[92m'\n    WARNIN"
  },
  {
    "path": "bench/workload_gen/gen_workload.sh",
    "chars": 284,
    "preview": "#!bin/bash\n\npython gen_load.py randint uniform\npython gen_txn.py randint uniform\npython gen_txn.py randint zipfian\n#pyth"
  },
  {
    "path": "bench/workload_gen/workload_spec/workload_template",
    "chars": 7245,
    "preview": "# Copyright (c) 2012-2016 YCSB contributors. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (th"
  },
  {
    "path": "bench/workload_gen/workload_spec/workloadc_email_latest",
    "chars": 3044,
    "preview": "# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                   "
  },
  {
    "path": "bench/workload_gen/workload_spec/workloadc_email_uniform",
    "chars": 3073,
    "preview": "# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                   "
  },
  {
    "path": "bench/workload_gen/workload_spec/workloadc_email_zipfian",
    "chars": 3045,
    "preview": "# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                   "
  },
  {
    "path": "bench/workload_gen/workload_spec/workloadc_randint_latest",
    "chars": 3045,
    "preview": "# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                   "
  },
  {
    "path": "bench/workload_gen/workload_spec/workloadc_randint_uniform",
    "chars": 3074,
    "preview": "# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                   "
  },
  {
    "path": "bench/workload_gen/workload_spec/workloadc_randint_zipfian",
    "chars": 3046,
    "preview": "# Copyright (c) 2010 Yahoo! Inc. All rights reserved.                                                                   "
  },
  {
    "path": "bench/workload_gen/ycsb_download.sh",
    "chars": 193,
    "preview": "mkdir ../workloads\ncurl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.12.0/ycsb-0.12.0.tar."
  },
  {
    "path": "bench/workload_multi_thread.cpp",
    "chars": 8690,
    "preview": "#include \"bench.hpp\"\n#include \"filter_factory.hpp\"\n\n//#define VERBOSE 1\n\nstatic std::vector<std::string> txn_keys;\nstati"
  },
  {
    "path": "include/bitvector.hpp",
    "chars": 5077,
    "preview": "#ifndef BITVECTOR_H_\n#define BITVECTOR_H_\n\n#include <assert.h>\n\n#include <vector>\n\n#include \"config.hpp\"\n\nnamespace surf"
  },
  {
    "path": "include/config.hpp",
    "chars": 1460,
    "preview": "#ifndef CONFIG_H_\n#define CONFIG_H_\n\n#include <stdint.h>\n#include <string.h>\n\nnamespace surf {\n\nusing level_t = uint32_t"
  },
  {
    "path": "include/hash.hpp",
    "chars": 1319,
    "preview": "#ifndef HASH_H_\n#define HASH_H_\n\n#include <string>\n\nnamespace surf {\n\n//************************************************"
  },
  {
    "path": "include/label_vector.hpp",
    "chars": 6246,
    "preview": "#ifndef LABELVECTOR_H_\n#define LABELVECTOR_H_\n\n#include <emmintrin.h>\n\n#include <vector>\n\n#include \"config.hpp\"\n\nnamespa"
  },
  {
    "path": "include/louds_dense.hpp",
    "chars": 21946,
    "preview": "#ifndef LOUDSDENSE_H_\n#define LOUDSDENSE_H_\n\n#include <string>\n\n#include \"config.hpp\"\n#include \"rank.hpp\"\n#include \"suff"
  },
  {
    "path": "include/louds_sparse.hpp",
    "chars": 22415,
    "preview": "#ifndef LOUDSSPARSE_H_\n#define LOUDSSPARSE_H_\n\n#include <string>\n\n#include \"config.hpp\"\n#include \"label_vector.hpp\"\n#inc"
  },
  {
    "path": "include/popcount.h",
    "chars": 7331,
    "preview": "/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */\n#ifndef _FASTRANK_POPCOUNT_H_\n#define _FASTRANK_POPCOU"
  },
  {
    "path": "include/rank.hpp",
    "chars": 3720,
    "preview": "#ifndef RANK_H_\n#define RANK_H_\n\n#include \"bitvector.hpp\"\n\n#include <assert.h>\n\n#include <vector>\n\n#include \"popcount.h\""
  },
  {
    "path": "include/select.hpp",
    "chars": 5189,
    "preview": "#ifndef SELECT_H_\n#define SELECT_H_\n\n#include \"bitvector.hpp\"\n\n#include <assert.h>\n\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "include/suffix.hpp",
    "chars": 8646,
    "preview": "#ifndef SUFFIX_H_\n#define SUFFIX_H_\n\n#include \"bitvector.hpp\"\n\n#include <assert.h>\n\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "include/surf.hpp",
    "chars": 11606,
    "preview": "#ifndef SURF_H_\n#define SURF_H_\n\n#include <string>\n#include <vector>\n\n#include \"config.hpp\"\n#include \"louds_dense.hpp\"\n#"
  },
  {
    "path": "include/surf_builder.hpp",
    "chars": 14921,
    "preview": "#ifndef SURFBUILDER_H_\n#define SURFBUILDER_H_\n\n#include <assert.h>\n\n#include <string>\n#include <vector>\n\n#include \"confi"
  },
  {
    "path": "simple_example.cpp",
    "chars": 3356,
    "preview": "#include <iostream>\n#include <vector>\n\n#include \"include/surf.hpp\"\n\nusing namespace surf;\n\nint main() {\n    std::vector<"
  },
  {
    "path": "src/CMakeLists.txt",
    "chars": 27,
    "preview": "add_library(surf surf.cpp)\n"
  },
  {
    "path": "test/CMakeLists.txt",
    "chars": 320,
    "preview": "find_package(GTest REQUIRED)\ninclude_directories(${GTEST_INCLUDE_DIR})\n\nfunction (add_surf_test file_name )\n  add_execut"
  },
  {
    "path": "test/unitTest/CMakeLists.txt",
    "chars": 722,
    "preview": "find_package(GTest REQUIRED)\ninclude_directories(${GTEST_INCLUDE_DIR})\n\nfunction (add_unit_test file_name)\n  add_executa"
  },
  {
    "path": "test/unitTest/test_bitvector.cpp",
    "chars": 6478,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"bitvect"
  },
  {
    "path": "test/unitTest/test_label_vector.cpp",
    "chars": 8226,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_louds_dense.cpp",
    "chars": 14077,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_louds_dense_small.cpp",
    "chars": 1748,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <string>\n#include <vector>\n\n#include \"config.hpp\"\n#include \"surf"
  },
  {
    "path": "test/unitTest/test_louds_sparse.cpp",
    "chars": 14588,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_louds_sparse_small.cpp",
    "chars": 1584,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <string>\n#include <vector>\n\n#include \"config.hpp\"\n#include \"surf"
  },
  {
    "path": "test/unitTest/test_rank.cpp",
    "chars": 4200,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_select.cpp",
    "chars": 3221,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_suffix.cpp",
    "chars": 9676,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_suffix_vector.cpp",
    "chars": 1939,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_surf.cpp",
    "chars": 19966,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_surf_builder.cpp",
    "chars": 13068,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <fstream>\n#include <string>\n#include <vector>\n\n#include \"config."
  },
  {
    "path": "test/unitTest/test_surf_small.cpp",
    "chars": 1634,
    "preview": "#include \"gtest/gtest.h\"\n\n#include <assert.h>\n\n#include <string>\n#include <vector>\n\n#include \"config.hpp\"\n#include \"surf"
  },
  {
    "path": "test/words.txt",
    "chars": 2482354,
    "preview": "a\naa\naal\naalii\naam\naani\naardvark\naardwolf\naaron\naaronic\naaronical\naaronite\naaronitic\naaru\nab\naba\nababdeh\nababua\nabac\naba"
  }
]

About this extraction

This page contains the full source code of the efficient/SuRF GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 60 files (2.7 MB), approximately 702.7k tokens, and a symbol index with 376 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — a free GitHub-repository-to-text converter for AI tools. Built by Nikandr Surkov.

Copied to clipboard!