Repository: cmuparlay/ParlayANN
Branch: main
Commit: 573f3cf67350
Files: 157
Total size: 633.9 KB

Directory structure:
gitextract_81v6usbn/

├── .bazelrc
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── WORKSPACE
├── algorithms/
│   ├── CMakeLists.txt
│   ├── HCNNG/
│   │   ├── CMakeLists.txt
│   │   ├── Makefile
│   │   ├── clusterEdge.h
│   │   ├── hcnng_index.h
│   │   ├── neighbors.h
│   │   └── scripts/
│   │       ├── fashion
│   │       ├── gist_1
│   │       ├── glove100
│   │       ├── glove25
│   │       ├── nytimes
│   │       └── sift
│   ├── HNSW/
│   │   ├── CMakeLists.txt
│   │   ├── HNSW.hpp
│   │   ├── debug.hpp
│   │   ├── dist.hpp
│   │   ├── h5_ops.hpp
│   │   └── type_point.hpp
│   ├── bench/
│   │   ├── BUILD
│   │   ├── IO.h
│   │   ├── MakeBench
│   │   ├── Makefile
│   │   ├── benchUtils.h
│   │   ├── common/
│   │   │   ├── IO.h
│   │   │   ├── MakeBench
│   │   │   ├── MakeBenchLink
│   │   │   ├── atomics.h
│   │   │   ├── dataGen.h
│   │   │   ├── geometry.h
│   │   │   ├── geometryIO.h
│   │   │   ├── get_time.h
│   │   │   ├── glue.h
│   │   │   ├── graph.h
│   │   │   ├── graphIO.h
│   │   │   ├── graphUtils.h
│   │   │   ├── ligraLight.h
│   │   │   ├── parallelDefs
│   │   │   ├── parallelDefsANN
│   │   │   ├── parallelDefs_OMP
│   │   │   ├── parseCommandLine.h
│   │   │   ├── parse_command_line.h
│   │   │   ├── runTests.py
│   │   │   ├── runTestsANN.py
│   │   │   ├── seqDefs
│   │   │   ├── sequenceIO.h
│   │   │   ├── speculative_for.h
│   │   │   ├── time_loop.h
│   │   │   ├── topology.h
│   │   │   └── topology_from_triangles.h
│   │   ├── get_time.h
│   │   ├── neighborsTime.C
│   │   ├── parallelDefsANN
│   │   ├── parse_command_line.h
│   │   └── time_loop.h
│   ├── pyNNDescent/
│   │   ├── CMakeLists.txt
│   │   ├── Makefile
│   │   ├── clusterPynn.h
│   │   ├── neighbors.h
│   │   ├── pynn_index.h
│   │   └── scripts/
│   │       ├── nytimes
│   │       └── sift
│   ├── tutorial.sh
│   ├── utils/
│   │   ├── BUILD
│   │   ├── NSGDist.h
│   │   ├── beamSearch.h
│   │   ├── check_nn_recall.h
│   │   ├── check_range_recall.h
│   │   ├── csvfile.h
│   │   ├── doublingSearch.h
│   │   ├── earlyStopping.h
│   │   ├── euclidian_point.h
│   │   ├── graph.h
│   │   ├── graph_reorder.h
│   │   ├── hashset.h
│   │   ├── jl_point.h
│   │   ├── mips_point.h
│   │   ├── mmap.h
│   │   ├── parse_results.h
│   │   ├── point_range.h
│   │   ├── rangeSearch.h
│   │   ├── simpleGraph.h
│   │   ├── stats.h
│   │   ├── types.h
│   │   └── union.h
│   ├── vamana/
│   │   ├── BUILD
│   │   ├── CMakeLists.txt
│   │   ├── Makefile
│   │   ├── index.h
│   │   ├── index_test.cc
│   │   ├── neighbors.h
│   │   ├── neighbors.sh
│   │   ├── neighbors_test.cc
│   │   └── scripts/
│   │       ├── OpenAIArXiv
│   │       ├── deep10M
│   │       ├── fashion
│   │       ├── gist
│   │       ├── glove100
│   │       ├── glove25
│   │       ├── msmarco_websearch
│   │       ├── nytimes
│   │       ├── sift
│   │       ├── sift100
│   │       ├── space_1
│   │       ├── space_10
│   │       ├── t2i_1
│   │       ├── t2i_10
│   │       └── wikipedia_cohere
│   └── vamanaRange/
│       ├── CMakeLists.txt
│       ├── Makefile
│       ├── index.h
│       └── neighbors.h
├── build/
│   └── _deps/
│       └── parlaylib-subbuild/
│           └── CMakeLists.txt
├── data_tools/
│   ├── Makefile
│   ├── compute_groundtruth.cpp
│   ├── compute_range_groundtruth.cpp
│   ├── crop.cpp
│   ├── random_sample.cpp
│   └── vec_to_bin.cpp
├── docs/
│   ├── README.md
│   ├── algorithms.md
│   ├── data_tools.md
│   ├── quickstart.md
│   └── rangesearch.md
├── python/
│   ├── __init__.py
│   ├── _builder.py
│   ├── _builder.pyi
│   ├── _common.py
│   ├── _files.py
│   ├── big_env.yml
│   ├── builder.cpp
│   ├── compile.sh
│   ├── defaults.py
│   ├── graph_index.cpp
│   ├── module.cpp
│   ├── scripts/
│   │   ├── fashion_test.py
│   │   ├── gist_test.py
│   │   ├── glove100_test.py
│   │   ├── glove25_test.py
│   │   ├── nyt_test.py
│   │   └── sift_test.py
│   ├── sift_test.py
│   ├── test.py
│   └── wrapper.py
└── rangeSearch/
    ├── bench/
    │   ├── .gitignore
    │   ├── IO.h
    │   ├── MakeBench
    │   ├── Makefile
    │   ├── get_time.h
    │   ├── parallelDefsANN
    │   └── rangeTime.C
    └── vamanaRange/
        ├── Makefile
        └── range.h

================================================
FILE CONTENTS
================================================

================================================
FILE: .bazelrc
================================================
# This is from Bazel's former travis setup, to avoid blowing up the RAM usage.
startup --host_jvm_args=-Xmx2500m
startup --host_jvm_args=-Xms2500m
# test --ram_utilization_factor=10 # comment-out for github actions.

# This is so we understand failures better
build --verbose_failures

# This is so we don't use sandboxed execution. Sandboxed execution
# runs stuff in a container, and since Travis already runs its script
# in a container (unless you require sudo in your .travis.yml) this
# fails to run tests.
build --spawn_strategy=standalone --genrule_strategy=standalone
test --test_strategy=standalone

# Below this line, .travis.yml will cat the default bazelrc.
# This is needed so Bazel starts with the base workspace in its
# package path.


# By default build in C++17 mode using the Homegrown scheduler for parallelism.
#build --repo_env=CC=clang++-12
build --repo_env=CC=g++
build --cxxopt=-std=c++17
build --cxxopt=-mcx16        # 16 byte CAS
build --cxxopt=-DHOMEGROWN   # use the homegrown scheduler
build --cxxopt=-DLONG        # use 8 byte vertex identifiers
build --cxxopt=-DAMORTIZEDPD # use amortized_bytepd encoding scheme for compressed graphs
build --cxxopt=-DUSEMALLOC
build --cxxopt=-DPARLAY_USE_STD_ALLOC
build --cxxopt=-pthread      # necessary for homegrown scheduler
build --cxxopt=-march=native
build --cxxopt=-fvisibility=hidden
build --cxxopt=-fvisibility-inlines-hidden
build --cxxopt=-fsized-deallocation  # https://github.com/pybind/pybind11/issues/1604 (for clang)
build -c opt

# C++ warning flags.
build --cxxopt=-Wall
build --cxxopt=-Wextra
build --cxxopt=-Wcast-qual
build --cxxopt=-Wno-unused-parameter
build --cxxopt=-Wpointer-arith
# Turning on -Wshadow rather than just -Wshadow=local would be nice, but the
# codebase currently contains lots of instances of global shadowing.
#build --cxxopt=-Wshadow=local
build --cxxopt=-Wvla

# Build without parallelism.
build:serial --cxxopt=-UHOMEGROWN
build:serial --cxxopt=-DPARLAY_SEQUENTIAL

# Build using CilkPlus for parallelism.
build:cilk --cxxopt=-UHOMEGROWN
build:cilk --cxxopt=-DCILK
build:cilk --cxxopt=-fcilkplus
build:cilk --linkopt=-lcilkrts

# Build using OpenMP for parallelism.
build:openmp --cxxopt=-UHOMEGROWN
build:openmp --cxxopt=-DOPENMP
build:openmp --cxxopt=-fopenmp
build:openmp --linkopt=-fopenmp

# Instruments the build with AddressSanitizer
# (https://github.com/google/sanitizers/wiki/AddressSanitizer).
# Invoke by adding the `--config=asan` flag, e.g.,
#     bazel run --config=asan <build target>
build:asan --strip=never
build:asan --cxxopt=-fsanitize=address
build:asan --cxxopt=-O1
build:asan --cxxopt=-g
build:asan --cxxopt=-fno-omit-frame-pointer
build:asan --cxxopt=-Wno-macro-redefined
build:asan --linkopt=-fsanitize=address


================================================
FILE: .gitmodules
================================================
[submodule "parlaylib"]
	path = parlaylib
	url = https://github.com/cmuparlay/parlaylib.git


================================================
FILE: CMakeLists.txt
================================================
# Top-level build script for ParlayANN.
#
# CMake 3.16 is the minimum because the per-algorithm CMakeLists files call
# target_precompile_headers(), which does not exist in earlier releases.
cmake_minimum_required(VERSION 3.16)
project(PARLAYANN VERSION 1
        DESCRIPTION "ParlayANN is a library of approximate nearest neighbor search algorithms, along with a set of useful tools for designing such algorithms. It is written in C++ and uses parallel primitives from ParlayLib. Currently it includes implementations of the ANNS algorithms DiskANN, HNSW, HCNNG, and pyNNDescent."
        LANGUAGES CXX)

include(CheckCXXCompilerFlag)
include(GNUInstallDirs)

# Set a default build type
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "RELEASE" CACHE STRING "Build type (Release)" FORCE)
  message(STATUS "No build type specified. Defaulted to RELEASE.")
  message(STATUS "To specify a build type, add -DCMAKE_BUILD_TYPE=<DEBUG/RELEASE/RELWITHDEBINFO/MINSIZEREL>")
endif(NOT CMAKE_BUILD_TYPE)

# Per-configuration flags: debug builds keep frame pointers and disable
# optimization; release builds target the host CPU's instruction set.
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -fno-omit-frame-pointer")
set (CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -march=native")

# Echo the effective configuration so it is visible in the configure log.
message(STATUS "PARLAYANN VERSION ${PARLAYANN_VERSION}")
message(STATUS "---------------------------- General configuration -----------------------------")
message(STATUS "CMake Generator:                ${CMAKE_GENERATOR}")
message(STATUS "Compiler:                       ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "Build type:                     ${CMAKE_BUILD_TYPE}")
message(STATUS "CMAKE_CXX_FLAGS:                ${CMAKE_CXX_FLAGS}")
message(STATUS "CMAKE_CXX_FLAGS_DEBUG:          ${CMAKE_CXX_FLAGS_DEBUG}")
message(STATUS "CMAKE_CXX_FLAGS_RELEASE:        ${CMAKE_CXX_FLAGS_RELEASE}")
message(STATUS "CMAKE_CXX_FLAGS_RELWITHDEBINFO: ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
message(STATUS "CMAKE_EXE_LINKER_FLAGS          ${CMAKE_EXE_LINKER_FLAGS}")
message(STATUS "CMAKE_INSTALL_PREFIX:           ${CMAKE_INSTALL_PREFIX}" )

# Set module path
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

# Language level used by every target in the project.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Header-only interface target exposing the ParlayLib submodule's headers.
add_library(parlay INTERFACE)
target_include_directories(parlay INTERFACE "${PROJECT_SOURCE_DIR}/parlaylib/include")

# Link against system threads
find_package(Threads REQUIRED)
target_link_libraries(parlay INTERFACE Threads::Threads)

add_subdirectory(algorithms)


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2023 magdalendobson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: WORKSPACE
================================================
# Bazel workspace definition: configures the C++ toolchain and declares the
# external source archives the build depends on.
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
load("@bazel_tools//tools/cpp:cc_configure.bzl", "cc_configure")

cc_configure()

# ParlayLib, fetched from the "bazel" tag archive. strip_prefix makes the
# archive's include/ directory the root of the @parlaylib repository.
http_archive(
    name = "parlaylib",
    sha256 = "68c062ad116fd49d77651d7a24fb985aa66e8ec9ad05176b6af3ab5d29a16b1f",
    strip_prefix = "parlaylib-bazel/include/",
    urls = ["https://github.com/ParAlg/parlaylib/archive/refs/tags/bazel.tar.gz"],
)

# GoogleTest release 1.11.0.
http_archive(
    name = "googletest",
    sha256 = "b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5",
    strip_prefix = "googletest-release-1.11.0",
    urls = ["https://github.com/google/googletest/archive/release-1.11.0.tar.gz"],
)


================================================
FILE: algorithms/CMakeLists.txt
================================================
# Add the build directory of each algorithm; every subdirectory provides its
# own CMakeLists.txt.
add_subdirectory(HCNNG)
add_subdirectory(HNSW)
add_subdirectory(pyNNDescent)
add_subdirectory(vamana)
add_subdirectory(vamanaRange)


================================================
FILE: algorithms/HCNNG/CMakeLists.txt
================================================
# HCNNG benchmark executable, built from the shared driver source in ../bench.
# This directory's neighbors.h is attached to the target as a precompiled
# header (target_precompile_headers requires CMake >= 3.16).
add_executable(neighbors-hcnng ../bench/neighborsTime.C)
  target_link_libraries(neighbors-hcnng PRIVATE parlay)
  target_precompile_headers(neighbors-hcnng PRIVATE neighbors.h)


================================================
FILE: algorithms/HCNNG/Makefile
================================================
include ../bench/parallelDefsANN   

REQUIRE =  ../utils/beamSearch.h hcnng_index.h ../utils/graph.h clusterEdge.h
BENCH = neighbors

include ../bench/MakeBench   


================================================
FILE: algorithms/HCNNG/clusterEdge.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <math.h>

#include <algorithm>
#include <functional>
#include <queue>
#include <random>
#include <set>

#include "../utils/graph.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"

namespace parlayANN {
  
// Picks two entries of `active_indices` at two distinct positions, using the
// first two values of the random stream `rnd`, and returns the stored values
// (not the positions).
//
// The second position is drawn from the size-1 remaining slots and shifted
// past the first one, so the two positions never coincide.
//
// Precondition: active_indices.size() >= 2. With fewer elements the modulo
// below divides by zero.
//
// Declared `inline` because this is a non-template function defined in a
// header; without it, including the header from more than one translation
// unit violates the one-definition rule.
inline std::pair<size_t, size_t>
select_two_random(parlay::sequence<size_t> &active_indices,
                  parlay::random &rnd) {
  const size_t count = active_indices.size();
  const size_t first_index = rnd.ith_rand(0) % count;
  const size_t second_index_unshifted = rnd.ith_rand(1) % (count - 1);
  // Skip over the slot already taken by first_index.
  const size_t second_index = (second_index_unshifted < first_index)
                                  ? second_index_unshifted
                                  : (second_index_unshifted + 1);

  return {active_indices[first_index], active_indices[second_index]};
}

// Random-bisection cluster trees.
//
// The point set is split recursively: two random pivots are drawn, every
// active point is assigned to the pivot it is closer to, and both halves are
// processed in parallel. Once a subset has at most `cluster_size` points, a
// caller-supplied leaf functor is invoked on it as
//   f(G, Points, active_indices, MSTDeg).
template <typename Point, typename PointRange, typename indexType>
struct cluster {
  using distanceType = typename Point::distanceType;
  using edge = std::pair<indexType, indexType>;
  using labelled_edge = std::pair<edge, distanceType>;
  using GraphI = Graph<indexType>;
  using PR = PointRange;

  cluster() {}

  // Returns (N(N-1) - (N-i)(N-i-1)) / 2, i.e. the number of pairs (r, c) with
  // r < i and r < c < N.
  int generate_index(int N, int i) {
    return (N * (N - 1) - (N - i) * (N - i - 1)) / 2;
  }

  // Splits `active_indices` by proximity to the pivots `first` and `second`
  // and recurses on both parts in parallel. Ties go to the `first` side.
  template <typename F>
  void recurse(GraphI &G, PR &Points, parlay::sequence<size_t> &active_indices,
               parlay::random &rnd, size_t cluster_size, F f, long MSTDeg,
               indexType first, indexType second) {
    // Split points based on which of the two points are closer.
    auto closer_first =
        parlay::filter(parlay::make_slice(active_indices), [&](size_t ind) {
          distanceType dist_first = Points[ind].distance(Points[first]);
          distanceType dist_second = Points[ind].distance(Points[second]);
          return dist_first <= dist_second;
        });

    auto closer_second =
        parlay::filter(parlay::make_slice(active_indices), [&](size_t ind) {
          distanceType dist_first = Points[ind].distance(Points[first]);
          distanceType dist_second = Points[ind].distance(Points[second]);
          return dist_second < dist_first;
        });

    // Independent random streams for the two subtrees.
    auto left_rnd = rnd.fork(0);
    auto right_rnd = rnd.fork(1);

    parlay::par_do(
        [&]() {
          random_clustering(G, Points, closer_first, left_rnd, cluster_size, f,
                            MSTDeg);
        },
        [&]() {
          random_clustering(G, Points, closer_second, right_rnd, cluster_size,
                            f, MSTDeg);
        });
  }

  // Recursive driver. Calls `g` on subsets of at most `cluster_size` points;
  // larger subsets are split around two randomly selected pivots.
  template <typename F>
  void random_clustering(GraphI &G, PR &Points,
                         parlay::sequence<size_t> &active_indices,
                         parlay::random &rnd, size_t cluster_size, F g,
                         long MSTDeg) {
    if (active_indices.size() <= cluster_size) {
      g(G, Points, active_indices, MSTDeg);
      return;
    }

    auto [f, s] = select_two_random(active_indices, rnd);
    if (Points[f] == Points[s]) {
      // The pivots compare equal, so a distance-based split is not used.
      // Instead cut the index list in the middle: the first size/2 entries
      // go left, the rest go right. size_t arithmetic keeps this correct
      // for subsets with more than INT_MAX entries.
      const size_t total = active_indices.size();
      const size_t half = total / 2;
      auto closer_first = parlay::tabulate(
          half, [&](size_t i) { return active_indices[i]; });
      auto closer_second = parlay::tabulate(
          total - half, [&](size_t i) { return active_indices[half + i]; });

      auto left_rnd = rnd.fork(0);
      auto right_rnd = rnd.fork(1);
      parlay::par_do(
          [&]() {
            random_clustering(G, Points, closer_first, left_rnd, cluster_size,
                              g, MSTDeg);
          },
          [&]() {
            random_clustering(G, Points, closer_second, right_rnd,
                              cluster_size, g, MSTDeg);
          });
    } else {
      recurse(G, Points, active_indices, rnd, cluster_size, g, MSTDeg, f, s);
    }
  }

  // Builds one cluster tree over all of `Points`, seeded from
  // std::random_device, so successive trees differ.
  template <typename F>
  void random_clustering_wrapper(GraphI &G, PR &Points, size_t cluster_size,
                                 F f, long MSTDeg) {
    std::random_device rd;
    std::mt19937 rng(rd());
    // size_t rather than int: narrowing Points.size() to int could produce a
    // negative upper bound, which violates the distribution's a <= b
    // precondition.
    std::uniform_int_distribution<size_t> uni(0, Points.size());
    parlay::random rnd(uni(rng));
    auto active_indices =
        parlay::tabulate(Points.size(), [&](size_t i) { return i; });
    random_clustering(G, Points, active_indices, rnd, cluster_size, f, MSTDeg);
  }

  // Builds `num_clusters` cluster trees one after another, printing a
  // progress line after each.
  template <typename F>
  void multiple_clustertrees(GraphI &G, PR &Points, long cluster_size,
                             long num_clusters, F f, long MSTDeg) {
    for (long i = 0; i < num_clusters; i++) {
      random_clustering_wrapper(G, Points, cluster_size, f, MSTDeg);
      std::cout << "Built cluster " << i << " of " << num_clusters << std::endl;
    }
  }
};

} // end namespace


================================================
FILE: algorithms/HCNNG/hcnng_index.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <math.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <queue>
#include <random>
#include <set>
#include <utility>
#include <vector>

#include "../utils/graph.h"
#include "clusterEdge.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"

namespace parlayANN {

// Union-find (disjoint-set forest) over the integers [0, N), with union by
// rank and path compression.
struct DisjointSet {
  parlay::sequence<int> parent;  // parent[i] == i marks a root
  parlay::sequence<int> rank;    // rank of the tree rooted at i (roots only)
  size_t N;

  // Creates N singleton sets.
  DisjointSet(size_t size) {
    N = size;
    parent = parlay::sequence<int>(N);
    rank = parlay::sequence<int>(N);
    parlay::parallel_for(0, N, [&](size_t i) {
      parent[i] = static_cast<int>(i);
      rank[i] = 0;
    });
  }

  // Merges the sets containing x and y.
  //
  // The roots are looked up with find() and their ranks compared. Reading
  // parent[x] / rank[x] directly is only right when x's path has just been
  // compressed, and it compares the ranks of the elements, not of the roots.
  void _union(int x, int y) {
    const int xroot = find(x);
    const int yroot = find(y);
    if (xroot == yroot) return;

    if (rank[xroot] < rank[yroot]) {
      parent[xroot] = yroot;
    } else {
      parent[yroot] = xroot;
      // Equal ranks: the merged tree is one level taller.
      if (rank[xroot] == rank[yroot]) rank[xroot] = rank[xroot] + 1;
    }
  }

  // Returns the root of x's set and points every node on the path from x
  // directly at that root.
  int find(int x) {
    if (parent[x] == x) return x;
    int c = x;
    while (parent[c] != c) {
      c = parent[c];
    }
    while (x != c) {
      int s = parent[x];
      parent[x] = c;
      x = s;
    }
    return c;
  }

  // Compresses every path so that parent[i] is the root of i for all i.
  void flatten() {
    for (size_t i = 0; i < N; i++) find(static_cast<int>(i));
  }

  // Returns true when all elements belong to a single set (trivially true
  // for an empty structure).
  bool is_full() {
    flatten();
    if (N == 0) return true;
    parlay::sequence<bool> truthvals(N);
    parlay::parallel_for(
        0, N, [&](size_t i) { truthvals[i] = (parent[i] == parent[0]); });
    auto ff = [&](bool a) { return not a; };
    auto filtered = parlay::filter(truthvals, ff);
    return filtered.size() == 0;
  }
};

template <typename Point, typename PointRange, typename indexType>
struct hcnng_index {
  using distanceType = typename Point::distanceType;
  using edge = std::pair<indexType, indexType>;
  using labelled_edge = std::pair<edge, distanceType>;
  using pid = std::pair<indexType, distanceType>;
  using GraphI = Graph<indexType>;
  using PR = PointRange;

  static constexpr indexType kNullId = std::numeric_limits<indexType>::max();
  static constexpr distanceType kNullDist =
      std::numeric_limits<distanceType>::max();
  static constexpr labelled_edge kNullEdge = {{kNullId, kNullId}, kNullDist};

  hcnng_index() {}

  static void remove_edge_duplicates(indexType p, GraphI &G) {
    parlay::sequence<indexType> points;
    for (indexType i = 0; i < G[p].size(); i++) {
      points.push_back(G[p][i]);
    }
    auto np = parlay::remove_duplicates(points);
    G[p].update_neighbors(np);
  }

  void remove_all_duplicates(GraphI &G) {
    parlay::parallel_for(0, G.size(),
                         [&](size_t i) { remove_edge_duplicates(i, G); });
  }

  // inserts each edge after checking for duplicates
  static void process_edges(GraphI &G, parlay::sequence<edge> edges) {
    long maxDeg = G.max_degree();
    auto grouped = parlay::group_by_key(edges);
    parlay::parallel_for(0, grouped.size(), [&](size_t i) {
      int32_t index = grouped[i].first;
      for (auto c : grouped[i].second) {
        if (G[index].size() < maxDeg) {
          G[index].append_neighbor(c);
        } else {
          remove_edge_duplicates(index, G);
          G[index].append_neighbor(c);
        }
      }
    });
  }

  // parameters dim and K are just to interface with the cluster tree code
  static void MSTk(GraphI &G, PR &Points,
                   parlay::sequence<size_t> &active_indices, long MSTDeg) {
    // preprocessing for Kruskal's
    size_t N = active_indices.size();
    long dim = Points.dimension();
    DisjointSet disjset(N);
    size_t m = 10;
    auto less = [&](labelled_edge a, labelled_edge b) {
      return a.second < b.second;
    };
    parlay::sequence<labelled_edge> candidate_edges(N * m, kNullEdge);
    parlay::parallel_for(0, N, [&](size_t i) {
      std::priority_queue<labelled_edge, std::vector<labelled_edge>,
                          decltype(less)>
          Q(less);
      for (indexType j = i + 1; j < N; j++) {
        distanceType dist_ij =
            Points[active_indices[i]].distance(Points[active_indices[j]]);
        if (Q.size() >= m) {
          distanceType topdist = Q.top().second;
          if (dist_ij < topdist) {
            labelled_edge e;
            e = std::make_pair(std::make_pair(i, j), dist_ij);
            Q.pop();
            Q.push(e);
          }
        } else {
          labelled_edge e;
          e = std::make_pair(std::make_pair(i, j), dist_ij);
          Q.push(e);
        }
      }
      indexType limit = std::min(Q.size(), m);
      for (indexType j = 0; j < limit; j++) {
        candidate_edges[i * m + j] = Q.top();
        Q.pop();
      }
    });

    parlay::sort_inplace(candidate_edges, less);

    auto degrees =
        parlay::tabulate(active_indices.size(), [&](size_t i) { return 0; });
    parlay::sequence<edge> MST_edges = parlay::sequence<edge>();
    // modified Kruskal's algorithm
    for (indexType i = 0; i < candidate_edges.size(); i++) {
      // Since we sorted, any null edges form the suffix.
      if (candidate_edges[i].second == kNullDist) break;
      labelled_edge e_l = candidate_edges[i];
      edge e = e_l.first;
      if ((disjset.find(e.first) != disjset.find(e.second)) &&
          (degrees[e.first] < MSTDeg) && (degrees[e.second] < MSTDeg)) {
        MST_edges.push_back(
            std::make_pair(active_indices[e.first], active_indices[e.second]));
        MST_edges.push_back(
            std::make_pair(active_indices[e.second], active_indices[e.first]));
        degrees[e.first] += 1;
        degrees[e.second] += 1;
        disjset._union(e.first, e.second);
      }
      if (i % N == 0) {
        if (disjset.is_full()) {
          break;
        }
      }
    }
    process_edges(G, std::move(MST_edges));
  }

  // robustPrune routine as found in DiskANN paper, with the exception that the
  // new candidate set is added to the field new_nbhs instead of directly
  // replacing the out_nbh of p
  void robustPrune(indexType p, PR &Points, GraphI &G, double alpha) {
    // add out neighbors of p to the candidate set.
    parlay::sequence<pid> candidates;
    for (size_t i = 0; i < G[p].size(); i++) {
      candidates.push_back(
          std::make_pair(G[p][i], Points[p].distance(Points[G[p][i]])));
    }

    // Sort the candidate set in reverse order according to distance from p.
    auto less = [&](pid a, pid b) { return a.second < b.second; };
    parlay::sort_inplace(candidates, less);

    parlay::sequence<int> new_nbhs = parlay::sequence<int>();

    size_t candidate_idx = 0;
    while (new_nbhs.size() < G.max_degree() &&
           candidate_idx < candidates.size()) {
      // Don't need to do modifications.
      indexType p_star = candidates[candidate_idx].first;
      candidate_idx++;
      if (p_star == p || p_star == kNullId) continue;

      new_nbhs.push_back(p_star);

      for (size_t i = candidate_idx; i < candidates.size(); i++) {
        indexType p_prime = candidates[i].first;
        if (p_prime != kNullId) {
          distanceType dist_starprime =
              Points[p_star].distance(Points[p_prime]);
          distanceType dist_pprime = candidates[i].second;
          if (alpha * dist_starprime <= dist_pprime)
            candidates[i].first = kNullId;
        }
      }
    }
    G[p].update_neighbors(new_nbhs);
  }

  void build_index(GraphI &G, PR &Points, long cluster_rounds,
                   long cluster_size, long MSTDeg) {
    cluster<Point, PointRange, indexType> C;
    C.multiple_clustertrees(G, Points, cluster_size, cluster_rounds, MSTk,
                            MSTDeg);
    remove_all_duplicates(G);
    // TODO: enable optional pruning (what is below now works, but
    // should be connected cleanly)
    // parlay::parallel_for(0, G.size(), [&] (size_t i){robustPrune(i, Points,
    // G, 1.1);});
  }
};

}  // namespace parlayANN


================================================
FILE: algorithms/HCNNG/neighbors.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <algorithm>
#include <cmath>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"
#include "../utils/NSGDist.h"  
#include "../utils/types.h"
#include "../utils/beamSearch.h"
#include "../utils/stats.h"
#include "../utils/parse_results.h"
#include "../utils/check_nn_recall.h"
#include "../utils/graph.h"
#include "hcnng_index.h"

namespace parlayANN {

// Entry point for the HCNNG benchmark: builds the index into G unless one was
// supplied (graph_built), prints a summary of the graph, and, when query
// points are present, runs the search/recall evaluation.
template<typename Point, typename PointRange, typename indexType>
void ANN(Graph<indexType> &G, long k, BuildParams &BP,
         PointRange &Query_Points,
         groundTruth<indexType> GT, char *res_file,
         bool graph_built, PointRange &Points) {

  parlay::internal::timer t("ANN");
  using HcnngBuilder = hcnng_index<Point, PointRange, indexType>;

  // Build time stays 0 when the graph was supplied by the caller.
  double idx_time = 0;
  if (!graph_built) {
    HcnngBuilder builder;
    builder.build_index(G, Points, BP.num_clusters, BP.cluster_size,
                        BP.MST_deg);
    idx_time = t.next_time();
  }

  // Describe the graph and print its statistics.
  std::string name = "HCNNG";
  std::string params = "Trees = " + std::to_string(BP.num_clusters);
  auto [avg_deg, max_deg] = graph_stats_(G);
  Graph_ G_(name, params, G.size(), avg_deg, max_deg, idx_time);
  G_.print();

  // Nothing to evaluate without queries.
  if (Query_Points.size() == 0) return;
  search_and_parse(G_, G, Points, Query_Points, GT, res_file, k, BP.verbose);
}

} // end namespace


================================================
FILE: algorithms/HCNNG/scripts/fashion
================================================
# bash
# Runs ./neighbors on the fashion-mnist-784-euclidean dataset under
# `numactl -i all`.
# NOTE(review): -R/-L/-alpha are not among the HCNNG flags used by the sibling
# nytimes/sift scripts (-cluster_size, -mst_deg, -num_clusters); confirm that
# these options are still meaningful for this binary.

numactl -i all ./neighbors -R 64 -L 128 -alpha 1.15 -data_type float -file_type bin -dist_func Euclidian -base_path data/fashion-mnist-784-euclidean/fashion-mnist-784-euclidean_base.fbin -query_path data/fashion-mnist-784-euclidean/fashion-mnist-784-euclidean_query.fbin -gt_path data/fashion-mnist-784-euclidean/fashion-mnist-784-euclidean_groundtruth -quantize 8 -two_pass 1 -graph_path data/fashion-mnist-784-euclidean/graph_64 -verbose 


================================================
FILE: algorithms/HCNNG/scripts/gist_1
================================================
# bash
# Runs ./neighbors on the gist dataset under `numactl -i all`.
# NOTE(review): -R/-L/-alpha are not among the HCNNG flags used by the sibling
# nytimes/sift scripts (-cluster_size, -mst_deg, -num_clusters); confirm that
# these options are still meaningful for this binary.

numactl -i all ./neighbors -R 100 -L 200 -alpha 1.1 -data_type float -file_type bin -dist_func Euclidian -base_path data/gist/gist_base.fbin -query_path data/gist/gist_query.fbin -gt_path data/gist/gist-1M -quantize 16 -two_pass 1 -quantize_build 1 -graph_path data/gist/graph


================================================
FILE: algorithms/HCNNG/scripts/glove100
================================================
# bash
# Runs ./neighbors on the glove-100-angular dataset (mips distance, normalized)
# under `numactl -i all`.
# NOTE(review): -R/-L/-alpha are not among the HCNNG flags used by the sibling
# nytimes/sift scripts (-cluster_size, -mst_deg, -num_clusters); confirm that
# these options are still meaningful for this binary.

numactl -i all ./neighbors -R 150 -L 300 -alpha 1 -data_type float -file_type bin -dist_func mips -base_path data/glove-100-angular/glove-100-angular_base.fbin -query_path data/glove-100-angular/glove-100-angular_query.fbin -gt_path data/glove-100-angular/glove-100-angular_groundtruth -num_passes 2 -quantize_build 1 -quantize 16 -normalize -verbose -graph_path data/glove-100-angular/graph


================================================
FILE: algorithms/HCNNG/scripts/glove25
================================================
# bash
# Runs ./neighbors on the glove-25-angular dataset (mips distance, normalized)
# under `numactl -i all`.
# NOTE(review): -R/-L/-alpha are not among the HCNNG flags used by the sibling
# nytimes/sift scripts (-cluster_size, -mst_deg, -num_clusters); confirm that
# these options are still meaningful for this binary.

numactl -i all ./neighbors -R 150 -L 300 -alpha 1 -data_type float -file_type bin -dist_func mips -base_path data/glove-25-angular/glove-25-angular_base.fbin -query_path data/glove-25-angular/glove-25-angular_query.fbin -gt_path data/glove-25-angular/glove-25-angular_groundtruth -num_passes 2 -quantize_build -normalize -verbose


================================================
FILE: algorithms/HCNNG/scripts/nytimes
================================================
# bash
# Prints the HCNNG build and query command lines for the nytimes-256-angular
# dataset (the commands are echoed, not executed).
BUILD_ARGS="-cluster_size 1000 -mst_deg 3 -num_clusters 30 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -verbose"
TYPE_ARGS="-data_type float -dist_func mips -normalize -file_type bin"

# Deliberately NOT named PATH: assigning to PATH would replace the shell's
# executable search path and break any external command added to this script.
DATA_DIR=data/nytimes-256-angular
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_hcnng_1000_3_30

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query 
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/HCNNG/scripts/sift
================================================
# bash
# Prints the HCNNG build and query command lines for the sift-128-euclidean
# dataset (the commands are echoed, not executed).
BUILD_ARGS="-cluster_size 1000 -mst_deg 3 -num_clusters 30 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 8 -verbose"
TYPE_ARGS="-data_type float -dist_func Euclidian -file_type bin"

# Deliberately NOT named PATH: assigning to PATH would replace the shell's
# executable search path and break any external command added to this script.
DATA_DIR=data/sift-128-euclidean
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_hcnng_1000_3_30

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query 
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/HNSW/CMakeLists.txt
================================================
# add_executable(neighbors-hnsw ../bench/neighborsTime.C)
#   target_link_libraries(neighbors-hnsw PRIVATE parlay)
#   target_precompile_headers(neighbors-hnsw PRIVATE HNSW.hpp)


================================================
FILE: algorithms/HNSW/HNSW.hpp
================================================
#ifndef _HNSW_HPP
#define _HNSW_HPP

#include <cstdint>
#include <cstdio>
#include <cstdarg>
#include <cassert>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <random>
#include <memory>
#include <atomic>
#include <fstream>
#include <string>
#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <queue>
#include <set>
#include <iterator>
#include <type_traits>
#include <limits>
#include <thread>
// #include "parallelize.h"
#include <parlay/parallel.h>
#include <parlay/primitives.h>
#include <parlay/delayed_sequence.h>
#include <parlay/random.h>
#include "debug.hpp"
#include "../utils/beamSearch.h"
// Compile-time switch for debug tracing.
// When DEBUG_OUTPUT is non-zero, debug_output(...) forwards its arguments to
// fprintf(stderr, ...). Otherwise it expands to a call of an empty variadic
// lambda: the arguments are still evaluated, but nothing is printed.
#define DEBUG_OUTPUT 0
#if DEBUG_OUTPUT
#define debug_output(...) fprintf(stderr, __VA_ARGS__)
#else
#define debug_output(...) do{[](...){}(__VA_ARGS__);}while(0)
#endif // DEBUG_OUTPUT

namespace ANN{

  using namespace parlayANN;
  
// Identifies a distance metric by name.
enum class type_metric
{
	L2,
	ANGULAR,
	DOT
};

// Plain aggregate holding two float coordinates.
struct point
{
	float x;
	float y;
};

template<typename U, template<typename> class Allocator=std::allocator>
class HNSW
{
	using T = typename U::type_point;
	typedef uint32_t node_id;
public:
	/*
		Construct from the vectors [begin, end).
		std::iterator_trait<Iter>::value_type ought to be convertible to T
		dim: 				vector dimension
		m_l: 				controls the # of levels (a larger m_l leads to more layers)
		m: 					max degree
		ef_construction:	beam size during the construction
		alpha:				parameter of the heuristic (similar to the one in vamana)
		batch_base: 		growth rate of the batch size (discarded because of two passes)
	*/
	template<typename Iter>
	HNSW(Iter begin, Iter end, uint32_t dim, float m_l=1, uint32_t m=100, uint32_t ef_construction=50, float alpha=5, float batch_base=2);

	/*
		Construct from the saved model
		getter(i) returns the actual data (convertible to type T) of the vector with id i
	*/
	template<typename G>
	HNSW(const std::string &filename_model, G getter);

	parlay::sequence<std::pair<uint32_t,float>> search(
		const T &q, uint32_t k, uint32_t ef, const search_control &ctrl={}
	);
	// parlay::sequence<std::tuple<uint32_t,uint32_t,float>> search_ex(const T &q, uint32_t k, uint32_t ef, uint64_t verbose=0);
	// save the current model to a file
	void save(const std::string &filename_model) const;
public:
	typedef uint32_t type_index;

	// One vertex of the index. Kept as a plain aggregate because it is
	// brace-initialised as node{level, neighbors, data}; member order matters.
	struct node
	{
		// Highest layer on which this vertex has an adjacency list.
		uint32_t level;
		// Array of adjacency lists indexed by layer; allocated by the
		// constructors / insert() with `level+1` entries.
		parlay::sequence<node_id> *neighbors;
		// The stored point.
		T data;
	};

	// A (distance, vertex id) pair, ordered by distance only.
	struct dist
	{
		float d;    // distance value
		node_id u;  // vertex the distance refers to

		// True when this entry is strictly closer than `rhs`.
		constexpr bool operator<(const dist &rhs) const
		{
			return d < rhs.d;
		}

		// True when this entry is strictly farther than `rhs`.
		constexpr bool operator>(const dist &rhs) const
		{
			return rhs.d < d;
		}
	};

	// `dist` extended with a depth counter.
	struct dist_ex : dist {
		uint32_t depth;
	};

	// Comparator that is true when `lhs` is strictly farther than `rhs`
	// (i.e. it orders by descending distance).
	struct nearest
	{
		constexpr bool operator()(const dist &lhs, const dist &rhs) const
		{
			return rhs.d < lhs.d;
		}
	};

	// Comparator that is true when `lhs` is strictly closer than `rhs`
	// (i.e. it orders by ascending distance).
	struct farthest
	{
		constexpr bool operator()(const dist &lhs, const dist &rhs) const
		{
			return rhs.d > lhs.d;
		}
	};

/*
	struct cmp_id{
		constexpr bool operator()(const dist &lhs, const dist &rhs) const{
			return U::get_id(get_node(lhs.u).data)<U::get_id(get_node(rhs.u).data);
		}
	};
*/
	parlay::sequence<node_id> entrance; // To init
	// auto m, max_m0, m_L; // To init
	uint32_t dim;
	float m_l;
	uint32_t m;
	// uint32_t level_max = 30; // To init
	uint32_t ef_construction;
	float alpha;
	uint32_t n;
	Allocator<node> allocator;
	parlay::sequence<node> node_pool;
	mutable parlay::sequence<size_t> total_visited = parlay::sequence<size_t>(parlay::num_workers());
	mutable parlay::sequence<size_t> total_eval = parlay::sequence<size_t>(parlay::num_workers());
	mutable parlay::sequence<size_t> total_size_C = parlay::sequence<size_t>(parlay::num_workers());
	mutable parlay::sequence<size_t> total_range_candidate = parlay::sequence<size_t>(parlay::num_workers());

	static auto neighbourhood(node &u, uint32_t level)
		-> parlay::sequence<node_id>&
	{
		// const constexpr auto level_none = std::numeric_limits<uint32_t>::max();
		// return level==level_none? u.final_nbh: u.neighbors[level];
		// return level==0? u.final_nbh: u.neighbors[level];
		return u.neighbors[level];
	}

	static auto neighbourhood(const node &u, uint32_t level)
		-> const parlay::sequence<node_id>&
	{
		return neighbourhood(const_cast<node&>(u),level);
	}

	// Mutable access to the node stored at position `id` of node_pool.
	node& get_node(node_id id)
	{
		node &slot = node_pool[id];
		return slot;
	}

	// Read-only access to the node stored at position `id` of node_pool.
	const node& get_node(const node_id id) const
	{
		const node &slot = node_pool[id];
		return slot;
	}

/*
	static void add_connection(parlay::sequence<node_id> &neighbors, node &u, uint32_t level)
	{
		for(auto pv : neighbors)
		{
			assert(&u!=pv);
			pv->neighbors[level].push_back(&u);
			u.neighbors[level].push_back(pv);
		}
	}
*/
	// Callable wrapper around U::distance bound to a fixed dimension and a
	// reference point.
	//   eval(pv)      -> U::distance(reference point, pv, dim)
	//   eval(pu, pv)  -> U::distance(pu, pv, dim)
	// Only a reference to the point is stored; the point must outlive the
	// evaluator.
	class dist_evaluator
	{
		using point_t = T;
		using dist_t = float;

		std::reference_wrapper<const point_t> p;
		uint32_t dim;

	public:
		dist_evaluator(const point_t &p, uint32_t dim)
			: p(p), dim(dim)
		{
		}

		// Distance between two arbitrary points.
		dist_t operator()(const point_t &pu, const point_t &pv) const
		{
			return U::distance(pu, pv, dim);
		}

		// Distance from the bound reference point to `pv`.
		dist_t operator()(const point_t &pv) const
		{
			return (*this)(p, pv);
		}
	};

	// Read-only view of a single layer `l` of an HNSW index, exposing the
	// interface the generic search / pruning routines use
	// (nid_t, num_nodes, get_node, get_edges, max_degree, operator[]).
	// Only a reference to the index is held.
	struct graph
	{
		using nid_t = node_id;

		// Thin view over one adjacency list.
		template<class Nbh>
		struct edgeRange
		{
			std::reference_wrapper<Nbh> nbh;

			edgeRange(Nbh &nbh) : nbh(nbh)
			{
			}

			// Element access into the underlying adjacency list.
			decltype(auto) operator[](node_id pu) const
			{
				return nbh.get()[pu];
			}

			auto size() const
			{
				return nbh.get().size();
			}

			// Issues one prefetch per whole 64-byte chunk of the list's storage.
			void prefetch() const
			{
				const int num_chunks = (size() * sizeof(node_id))/64;
				for(int chunk=0; chunk<num_chunks; ++chunk)
					__builtin_prefetch((char*) nbh.get().data() + chunk*64);
			}
		};

		std::reference_wrapper<const HNSW<U,Allocator>> hnsw;
		uint32_t l;

		graph(const HNSW<U,Allocator> &hnsw, uint32_t l)
			: hnsw(hnsw), l(l)
		{
		}

		decltype(auto) num_nodes() const
		{
			return hnsw.get().n;
		}

		decltype(auto) get_node(node_id pu) const
		{
			return hnsw.get().get_node(pu);
		}

		// Adjacency list of `pu` on this view's layer.
		decltype(auto) get_edges(node_id pu)
		{
			const auto &index = hnsw.get();
			return index.neighbourhood(index.get_node(pu), l);
		}

		decltype(auto) get_edges(node_id pu) const
		{
			const auto &index = hnsw.get();
			return index.neighbourhood(index.get_node(pu), l);
		}

		// Degree bound of this layer, as given by HNSW::get_threshold_m.
		uint32_t max_degree() const
		{
			return hnsw.get().get_threshold_m(l);
		}

		auto operator[](node_id pu)
		{
			return edgeRange(get_edges(pu));
		}

		auto operator[](node_id pu) const
		{
			return edgeRange(get_edges(pu));
		}
	};

	// node* insert(const T &q, uint32_t id);
	template<typename Iter>
	void insert(Iter begin, Iter end, bool from_blank);

	// Shrinks the priority queue `C` in place to (about) `M` entries.
	//
	// Entries are popped from the top while C.size() > M. The popped entries
	// whose distance equals (within 1e-6) the smallest distance popped so far
	// are remembered; if the entry left on top of `C` has that same distance,
	// the remembered entries are pushed back, so ties at the cut-off are all
	// kept and the result may then hold more than `M` entries.
	//
	// `u` is unused. Queue must provide size/empty/top/pop/push and a
	// value_type with a float member `d`.
	template<typename Queue>
	void select_neighbors_simple_impl(const T &u, Queue &C, uint32_t M)
	{
		(void)u;
		parlay::sequence<typename Queue::value_type> tie;
		float dist_tie = 1e20;
		while(C.size()>M)
		{
			const auto &t = C.top();
			if(t.d+1e-6<dist_tie) // t.d<dist_tie
			{
				dist_tie = t.d;
				tie.clear();
			}
			if(fabs(dist_tie-t.d)<1e-6) // t.d==dist_tie
				tie.push_back(t);
			C.pop();
		}

		// Nothing is left to compare against when M==0 or C started empty;
		// calling top() on an empty queue would be undefined behaviour.
		if(C.empty()) return;

		if(fabs(dist_tie-C.top().d)<1e-6) // C.top().d==dist_tie
			while(!tie.empty())
			{
				C.push(tie.back());
				tie.pop_back();
			}
	}

	template<typename Queue>
	auto select_neighbors_simple(const T &u, const Queue &C, uint32_t M)
	{
		// The parameter C is intended to be copy constructed
		/*
		select_neighbors_simple_impl(u, C, M);
		return C;
		*/
		// auto R = parlay::sort(C, farthest());
		auto R = C;
		
		if(R.size()>M)
		{
			std::nth_element(R.begin(), R.begin()+M, R.end(), farthest());
			R.resize(M);
		}
		
		std::sort(R.begin(), R.end(), farthest());
		// if(R.size()>M) R.resize(M);
		/*
		uint32_t size_R = std::min(C.size(),M);
		parlay::sequence<node*> R;
		R.reserve(size_R);
		for(const auto &e : C)
			R.push_back(e.u);
		*/

		return R;
	}

	// Heuristic neighbour selection for point `u` on layer `level`.
	//
	//   C                      initial candidates
	//   M                      maximum number of ids returned
	//   extendCandidate        also consider the layer-`level` neighbours of
	//                          every candidate
	//   keepPrunedConnections  top the result up to M with rejected
	//                          candidates (in ascending distance order)
	//
	// Candidates are de-duplicated, their distance to `u` is recomputed with
	// U::distance, and they are scanned from closest to farthest. A candidate
	// e is rejected when some already selected r satisfies
	// distance(e, r) < distance(e, u) * alpha.
	// Returns the selected vertex ids.
	auto select_neighbors_heuristic(const T &u, 
		const parlay::sequence<dist> &C, uint32_t M,
		uint32_t level, bool extendCandidate, bool keepPrunedConnections)
	{
		// De-duplicated candidate ids; std::set::insert already ignores
		// duplicates, so no separate lookup is needed.
		std::set<node_id> W_tmp;
		for(const auto &e : C)
		{
			W_tmp.insert(e.u);
			if(extendCandidate)
			{
				for(node_id e_adj : neighbourhood(get_node(e.u),level))
					W_tmp.insert(e_adj);
			}
		}

		// Candidates with their distance to `u`, closest first.
		parlay::sequence<dist> W;
		W.reserve(W_tmp.size());
		for(node_id p : W_tmp)
			W.push_back({U::distance(get_node(p).data,u,dim), p});
		std::sort(W.begin(), W.end(), farthest());

		parlay::sequence<node_id> R;   // selected ids
		parlay::sequence<dist> W_d;    // rejected candidates, in scan order
		for(const auto &e : W)
		{
			if(R.size()>=M) break;
			const auto d_q = e.d;

			bool is_good = true;
			for(const auto &r : R)
			{
				const auto d_r = U::distance(get_node(e.u).data, get_node(r).data, dim);
				if(d_r<d_q*alpha)
				{
					is_good = false;
					break;
				}
			}

			if(is_good)
				R.push_back(e.u);
			else
				W_d.push_back(e);
		}

		if(keepPrunedConnections)
		{
			for(auto it=W_d.begin(); it!=W_d.end() && R.size()<M; ++it)
				R.push_back(it->u);
		}

		// Return the local directly (not through a reference alias) so it can
		// be moved / elided instead of copied.
		return R;
	}

	// Alpha-based pruning of a candidate list.
	//
	//   cand    candidate connections (elements are `dist`); copied or moved
	//           into a working sequence depending on how it is passed
	//   size    maximum number of connections to return
	//   f_dist  callable returning the distance between two point payloads
	//   g       graph view used to fetch point payloads via g.get_node(id).data
	//
	// Candidates are sorted in ascending order and scanned; a candidate c is
	// dropped when some already kept r satisfies
	// f_dist(c, r) < c.d * alpha. Scanning stops once `size` connections have
	// been kept. Returns the kept connections in scan order.
	template<class Seq_, class D, class G, class Seq=std::remove_cv_t<std::remove_reference_t<Seq_>>>
	Seq prune_heuristic(
		Seq_ &&cand, uint32_t size, D f_dist, G g) const
	{
		using conn = dist;

		Seq res;
		// A budget of zero means nothing may be kept.
		if(size==0) return res;

		Seq workset = std::forward<Seq_>(cand);
		parlay::sort_inplace(workset);

		for(conn &c : workset)
		{
			const auto d_cu = c.d*alpha;

			bool is_pruned = false;
			for(const conn &r : res)
			{
				const auto d_cr = f_dist(
					g.get_node(c.u).data,
					g.get_node(r.u).data
				);
				if(d_cr<d_cu)
				{
					is_pruned = true;
					break;
				}
			}

			if(is_pruned) continue;

			res.push_back(std::move(c));
			if(res.size()==size) break;
		}
		return res;
	}

	// Chooses up to `M` neighbours for point `u` on layer `level` from the
	// candidates `C` by running prune_heuristic, and returns their vertex ids.
	// `extendCandidate` and `keepPrunedConnections` are accepted for interface
	// compatibility but are not used by the current implementation.
	auto select_neighbors(const T &u, 
		const parlay::sequence<dist> &C, uint32_t M,
		uint32_t level, bool extendCandidate=false, bool keepPrunedConnections=false)
	{
		dist_evaluator distance_to_u(u, dim);
		graph layer_view(*this, level);

		auto kept = prune_heuristic(C, M, distance_to_u, layer_view);

		// Strip the distances, keeping only the ids.
		const auto num_kept = kept.size();
		return parlay::tabulate(num_kept, [&](size_t idx){
			return kept[idx].u;
		});
	}

	// Draws a random level for a new node: uint32_t(-log(x) * m_l) with x
	// drawn from a uniform real distribution constructed with bounds
	// (std::numeric_limits<float>::min(), 1.0). Each thread keeps its own
	// generator, seeded once with parlay::worker_id().
	uint32_t get_level_random()
	{
		static thread_local std::mt19937 gen{parlay::worker_id()};
		static thread_local std::uniform_real_distribution<> dis(std::numeric_limits<float>::min(), 1.0);

		const auto sample = dis(gen);
		return uint32_t(-log(sample)*m_l);
	}

	// auto search_layer(const node &u, const parlay::sequence<node_id> &eps, uint32_t ef, uint32_t l_c, uint64_t verbose=0) const; // To static
	auto search_layer(const node &u, const parlay::sequence<node_id> &eps, uint32_t ef, uint32_t l_c, search_control ctrl={}) const; // To static
	auto search_layer_bak(const node &u, const parlay::sequence<node_id> &eps, uint32_t ef, uint32_t l_c, search_control ctrl={}) const; // To static
	auto search_layer_new_ex(const node &u, const parlay::sequence<node_id> &eps, uint32_t ef, uint32_t l_c, search_control ctrl={}) const; // To static
	auto beam_search_ex(const node &u, const parlay::sequence<node_id> &eps, uint32_t beamSize, uint32_t l_c, search_control ctrl={}) const;
	parlay::sequence<node_id> search_layer_to(
		const node &u, uint32_t ef, uint32_t l_stop, const search_control &ctrl={}
	);

	// Degree bound for a layer: 2*m on layer 0, m on every other layer.
	auto get_threshold_m(uint32_t level) const
	{
		if(level!=0)
			return m;
		return m*2;
	}

public:
	auto get_deg(uint32_t level=0)
	{
		parlay::sequence<uint32_t> res;
		res.reserve(node_pool.size());
		for(const node &e : node_pool)
		{
			if(e.level>=level)
				res.push_back(e.neighbors[level].size());
		}
		return res;
	}

	auto get_indeg(uint32_t level) const
	{
		static uint32_t *indeg[16] = {nullptr};
		auto *&res = indeg[level];
		if(!res)
		{
			res = new uint32_t[n];
			for(uint32_t i=0; i<n; ++i)
				res[i] = 0;
			for(const node_id pu : node_pool)
			{
				if(get_node(pu).level<level) continue;
				for(const node_id pv : get_node(pu).neighbors[level])
					res[U::get_id(get_node(pv).data)]++;
			}
		}
		return res;
	}

	uint32_t get_height() const
	{
		return get_node(entrance[0]).level;
	}

	size_t cnt_degree(uint32_t l) const
	{
		auto cnt_each = parlay::delayed_seq<size_t>(n, [&](size_t i){
			node_id pu = i;
			return get_node(pu).level<l? 0:
				neighbourhood(get_node(pu),l).size();
		});
		return parlay::reduce(cnt_each, parlay::addm<size_t>());
	}

	size_t cnt_vertex(uint32_t l) const
	{
		auto cnt_each = parlay::delayed_seq<size_t>(n, [&](size_t i){
			node_id pu = i;
			return get_node(pu).level<l? 0: 1;
		});
		return parlay::reduce(cnt_each, parlay::addm<size_t>());
	}

	size_t get_degree_max(uint32_t l) const
	{
		auto cnt_each = parlay::delayed_seq<size_t>(n, [&](size_t i){
			node_id pu = i;
			return get_node(pu).level<l? 0:
				neighbourhood(get_node(pu),l).size();
		});
		return parlay::reduce(cnt_each, parlay::maxm<size_t>());
	}
/*
	void debug_output_graph(uint32_t l)
	{
		// return;
		debug_output("Printing the graph at level %u\n", l);
		auto node_exist = parlay::pack(
			node_pool,
			parlay::delayed_seq<bool>(node_pool.size(),[&](size_t i){
				return node_pool[i]->level>=l;
			})
		);
		const auto num_vertices = node_exist.size();
		const auto num_edges = parlay::reduce(
			parlay::delayed_seq<uint64_t>(node_exist.size(),[&](size_t i){
				return node_exist[i]->neighbors[l].size();
			}),
			parlay::addm<uint64_t>{}
		);
		debug_output("# vertices: %lu, # edges: %llu\n", num_vertices, num_edges);

		for(node_id pu : node_exist)
		{
			debug_output("node_id: %u\n", U::get_id(get_node(pu).data));
			// if(!res[i]) continue;
			debug_output("\tneighbors:");
			for(node_id pv : neighbourhood(get_node(pu),l))
				debug_output(" %u", U::get_id(get_node(pv).data));
			debug_output("\n");
		}
	}
*/
};

// Loads an index from the binary model file `filename_model`.
//
// File layout, as read below: 4-byte tag "HNSW", uint32 version (must be 3),
// two size_t fields (type code and node size; read but not checked), the
// parameters dim, m_l, m, ef_construction, alpha, n, then for each of the n
// nodes its level and a uint32 point id, then for each node and each layer
// 0..level a size_t count followed by that many uint32 neighbour ids, and
// finally a size_t count followed by that many uint32 entrance ids.
//
// getter(id) must return the point payload (convertible to T) for a point id.
//
// Throws std::runtime_error if the file cannot be opened, has the wrong tag
// or version, or ends before all expected data was read.
template<typename U, template<typename> class Allocator>
template<typename G>
HNSW<U,Allocator>::HNSW(const std::string &filename_model, G getter)
{
	// Declared before the stream so that it is destroyed after it: the
	// stream's buffer must stay valid for as long as the stream uses it.
	std::unique_ptr<char[]> buffer;

	std::ifstream model(filename_model, std::ios::binary);
	if(!model.is_open())
		throw std::runtime_error("Failed to open the model");

	const auto size_buffer = 1024*1024*1024; // 1G
	buffer = std::make_unique<char[]>(size_buffer);
	model.rdbuf()->pubsetbuf(buffer.get(), size_buffer);

	// read(x) reads one standard-layout value; read(arr, count) reads `count`
	// elements of an array/pointer one by one.
	auto read = [&](auto &data, auto ...args){
		auto read_impl = [&](auto &f, auto &data, auto ...args){
			using value_t = std::remove_reference_t<decltype(data)>;
			if constexpr(std::is_pointer_v<std::decay_t<value_t>>)
			{
				auto read_array = [&](auto &data, size_t size, auto ...args){
					for(size_t i=0; i<size; ++i)
						f(f, data[i], args...);
				};
				// use the array extent as the size
				if constexpr(sizeof...(args)==0 && std::is_array_v<value_t>)
				{
					read_array(data, std::extent_v<value_t>);
				}
				else
				{
					static_assert(sizeof...(args), "size was not provided");
					read_array(data, args...);
				}
			}
			else
			{
				static_assert(std::is_standard_layout_v<value_t>);
				model.read((char*)&data, sizeof(data));
				// A short read would otherwise leave `data` uninitialised and
				// silently corrupt everything loaded afterwards.
				if(!model)
					throw std::runtime_error("Unexpected end of the model file");
			}
		};
		read_impl(read_impl, data, args...);
	};

	char model_type[5] = {'\000'};
	read(model_type, 4);
	// model_type is NUL-terminated: 5 zero-initialised bytes, 4 of them read.
	if(std::string(model_type)!="HNSW")
		throw std::runtime_error("Wrong type of model");
	uint32_t version;
	read(version);
	if(version!=3)
		throw std::runtime_error("Unsupported version");

	// These two fields are consumed to keep the stream position right; the
	// consistency checks against them are currently disabled.
	size_t code_U, size_node;
	read(code_U);
	read(size_node);
	fprintf(stderr, "U type loading %s\n", typeid(U).name());

	// read parameter configuration
	read(dim);
	read(m_l);
	read(m);
	read(ef_construction);
	read(alpha);
	read(n);
	puts("Configuration loaded");
	printf("dim = %u\n", dim);
	printf("m_l = %f\n", m_l);
	printf("m = %u\n", m);
	printf("efc = %u\n", ef_construction);
	printf("alpha = %f\n", alpha);
	printf("n = %u\n", n);

	// read node headers (level + point id)
	node_pool.resize(n);
	for(uint32_t i=0; i<n; ++i)
	{
		node &u = get_node(i);
		read(u.level);
		uint32_t id_u; // TODO: use generic type
		read(id_u);
		u.data = getter(id_u);
	}

	// read adjacency lists, one per layer per node
	for(node &u : node_pool)
	{
		u.neighbors = new parlay::sequence<node_id>[u.level+1];
		for(uint32_t l=0; l<=u.level; ++l)
		{
			size_t size;
			read(size);
			auto &nbh_u = u.neighbors[l];
			nbh_u.reserve(size);
			for(size_t i=0; i<size; ++i)
			{
				uint32_t id_v;
				read(id_v);
				nbh_u.push_back(id_v);
			}
		}
	}

	// read entrances
	size_t size;
	read(size);
	entrance.reserve(size);
	for(size_t i=0; i<size; ++i)
	{
		uint32_t id_u;
		read(id_u);
		entrance.push_back(id_u);
	}
}

// Builds an index from the points in [begin, end).
//
// Iter must be a random-access iterator whose value_type is exactly T.
// The first point becomes the initial entrance; the remaining points are
// inserted in input order, in batches. A batch ends at the smallest of: n,
// ceil(batch_begin*batch_base)+1, and batch_begin + 2% of n.
// Progress is reported on stderr roughly every 5%.
// An empty range leaves the index empty.
template<typename U, template<typename> class Allocator>
template<typename Iter>
HNSW<U,Allocator>::HNSW(Iter begin, Iter end, uint32_t dim_, float m_l_, uint32_t m_, uint32_t ef_construction_, float alpha_, float batch_base)
	: dim(dim_), m_l(m_l_), m(m_), ef_construction(ef_construction_), alpha(alpha_), n(std::distance(begin,end))
{
	static_assert(std::is_same_v<typename std::iterator_traits<Iter>::value_type, T>);
	static_assert(std::is_base_of_v<
		std::random_access_iterator_tag, typename std::iterator_traits<Iter>::iterator_category>);

	if(n==0) return;

	// Points are consumed in their original order (no shuffling).
	auto rand_seq = parlay::delayed_seq<T>(n, [&](uint32_t i){
		return *(begin+i);
	});

	// Seed the index with the first point as the only entrance.
	const auto level_ep = get_level_random();
	node_pool.resize(1);
	node_id entrance_init = 0;
	new(&get_node(entrance_init)) node{
		level_ep, 
		new parlay::sequence<node_id>[level_ep+1], 
		*rand_seq.begin()
		/*anything else*/
	};
	entrance.push_back(entrance_init);

	// n*0.02 truncates to 0 for n<50; a zero cap would produce empty batches
	// and the loop below would never advance.
	const uint32_t size_limit = std::max<uint32_t>(1, n*0.02);
	uint32_t batch_begin=0, batch_end=1;
	float progress = 0.0;
	while(batch_end<n)
	{
		batch_begin = batch_end;
		batch_end = std::min({n, (uint32_t)std::ceil(batch_begin*batch_base)+1, batch_begin+size_limit});
		// Always make progress, whatever batch_base is (batch_begin<n here,
		// so batch_begin+1 never exceeds n).
		batch_end = std::max(batch_end, batch_begin+1);

		insert(rand_seq.begin()+batch_begin, rand_seq.begin()+batch_end, true);

		if(batch_end>n*(progress+0.05))
		{
			progress = float(batch_end)/n;
			fprintf(stderr, "Built: %3.2f%%\n", progress*100);
		}
	}

	fprintf(stderr, "Index built\n");
}

// Inserts the batch of points [begin, end) into the index.
//
// from_blank == true : size_batch new nodes are appended to node_pool, each
//                      with a random level and `level+1` empty adjacency lists.
// from_blank == false: no nodes are created; the last size_batch entries of
//                      node_pool are taken to be the batch.
//
// Phases:
//   1. create / locate the nodes of the batch;
//   2. for every new node, descend from the current entrance through the
//      layers above the node's own level (search_layer with ef=1) to obtain
//      its entry points;
//   3. for each layer from the entrance level down to 0: search, select
//      neighbours, install the forward edges, then merge the reverse edges
//      into each target's list, keeping only the closest ones when the
//      layer's degree threshold would be exceeded;
//   4. update `entrance` if a new node reaches or exceeds the previous
//      entrance level.
//
// The batch is expected to be non-empty: phase 4 dereferences the result of
// std::max_element over the batch.
template<typename U, template<typename> class Allocator>
template<typename Iter>
void HNSW<U,Allocator>::insert(Iter begin, Iter end, bool from_blank)
{
	// Entrance level as it was before this batch; layers above it are not
	// processed by the per-layer loop below.
	const auto level_ep = get_node(entrance[0]).level;
	const auto size_batch = std::distance(begin,end);
	// Per-batch scratch arrays, indexed by position within the batch.
	auto node_new = std::make_unique<node_id[]>(size_batch);
	auto nbh_new = std::make_unique<parlay::sequence<node_id>[]>(size_batch);
	auto eps = std::make_unique<parlay::sequence<node_id>[]>(size_batch);
	//const float factor_m = from_blank? 0.5: 1;
	const auto factor_m = 1;

	debug_output("Insert %lu elements; from blank? [%c]\n", size_batch, "NY"[from_blank]);

	// auto *pool = allocator.allocate(size_batch);
	// first, query the nearest point as the starting point for each node to insert
	if(from_blank)
	{
		auto offset = node_pool.size();
		node_pool.resize(offset+size_batch);
	parlay::parallel_for(0, size_batch, [&](uint32_t i){
		const T &q = *(begin+i);
		const auto level_u = get_level_random();
		// auto *const pu = &pool[i];		// TODO: add pointer manager
		node_id pu = offset+i;

		new(&get_node(pu)) node{
			level_u,
			new parlay::sequence<node_id>[level_u+1],
			q
		};
		node_new[i] = pu;
	});
	}
	else
	{
	parlay::parallel_for(0, size_batch, [&](uint32_t i){
		node_new[i] = node_pool.size()-size_batch+i;
	});
	}

	debug_output("Nodes are settled\n");
	// TODO: merge ops
	// Greedy descent: walk the layers strictly above each node's own level,
	// keeping only the single best result as the entry point for the next layer.
	parlay::parallel_for(0, size_batch, [&](uint32_t i){
		auto &u = get_node(node_new[i]);
		const auto level_u = u.level;
		auto &eps_u = eps[i]; 
		// eps_u.push_back(entrance);
		eps_u = entrance;
		for(uint32_t l=level_ep; l>level_u; --l)
		{
			const auto res = search_layer(u, eps_u, 1, l); // TODO: optimize
			eps_u.clear();
			eps_u.push_back(res[0].u);
		}
	});

	debug_output("Finish searching entrances\n");
	// then we process them layer by layer (from high to low)
	for(int32_t l_c=level_ep; l_c>=0; --l_c) // TODO: fix the type
	{
		// edge_add[i] holds the (target, source) pairs produced by batch entry i.
		parlay::sequence<parlay::sequence<std::pair<node_id,node_id>>> edge_add(size_batch);

		debug_output("Finding neighbors on lev. %d\n", l_c);
		parlay::parallel_for(0, size_batch, [&](uint32_t i){
			node_id pu = node_new[i];
			auto &u = get_node(pu);
			// Nodes that do not reach this layer take no part in it.
			if((uint32_t)l_c>u.level) return;

			auto &eps_u = eps[i]; // TODO: check
			auto res = search_layer(u, eps_u, ef_construction, l_c);
			auto neighbors_vec = select_neighbors(u.data, res, get_threshold_m(l_c)/**factor_m*/, l_c);
			// move the content from `neighbors_vec` to `u.neighbors[l_c]`
			// auto &nbh_u = nbh_new[i];
			auto &edge_u = edge_add[i];
			// nbh_u.clear();
			edge_u.clear();
			// nbh_u.reserve(neighbors_vec.size());
			edge_u.reserve(neighbors_vec.size());
			/*
			for(uint32_t j=0; neighbors_vec.size()>0; ++j)
			{
				auto *pv = neighbors_vec.top().u;
				neighbors_vec.pop();
				// nbh_u[j] = pv;
				// edge_u[j] = std::make_pair(pv, &u);
				nbh_u.push_back(pv);
				edge_u.emplace_back(pv, &u);
			}
			*/
			for(node_id pv : neighbors_vec)
				edge_u.emplace_back(pv, pu);
			nbh_new[i] = std::move(neighbors_vec);

			// The full search result becomes the entry-point set for the next lower layer.
			eps_u.clear();
			/*
			while(res.size()>0)
			{
				eps_u.push_back(res.top().u); // TODO: optimize
				res.pop();
			}
			*/
			eps_u.reserve(res.size());
			for(const auto e : res)
				eps_u.push_back(e.u);
		});

		debug_output("Adding forward edges\n");
		parlay::parallel_for(0, size_batch, [&](uint32_t i){
			auto &u = get_node(node_new[i]);
			if((uint32_t)l_c<=u.level)
				neighbourhood(u,l_c) = std::move(nbh_new[i]);
		});

		debug_output("Adding reverse edges\n");
		// now we add edges in the other direction
		// Grouping by target lets each target's list be updated by exactly one task.
		auto edge_add_flatten = parlay::flatten(edge_add);
		auto edge_add_grouped = parlay::group_by_key(edge_add_flatten);

		parlay::parallel_for(0, edge_add_grouped.size(), [&](size_t j){
			node_id pv = edge_add_grouped[j].first;
			auto &nbh_v = neighbourhood(get_node(pv),l_c);
			auto &nbh_v_add = edge_add_grouped[j].second;

			// std::unordered_set<node_id> hash_table(nbh_v.begin(),nbh_v.end());
			/*
			for(auto it=nbh_v_add.begin(); it!=nbh_v_add.end();)
			{
				bool is_extant = *it==pv||std::find_if(nbh_v.begin(), nbh_v.end(), [&](const node_id pu_extant){
					return *it==pu_extant;
				})!=nbh_v.end();
				
				// bool is_extant = hash_table.find(*it)!=hash_table.end();
				it = is_extant? nbh_v_add.erase(it): std::next(it);
			}
			*/

			const uint32_t size_nbh_total = nbh_v.size()+nbh_v_add.size();

			const auto m_s = get_threshold_m(l_c)*factor_m;
			if(size_nbh_total>m_s)
			{
				// Over the degree threshold: rank old and new neighbours together
				// by distance to pv and keep the m_s closest.
				auto candidates = parlay::sequence<dist>(size_nbh_total);
				for(size_t k=0; k<nbh_v.size(); ++k)
					candidates[k] = dist{U::distance(get_node(nbh_v[k]).data,get_node(pv).data,dim), nbh_v[k]};
				for(size_t k=0; k<nbh_v_add.size(); ++k)
					candidates[k+nbh_v.size()] = dist{U::distance(get_node(nbh_v_add[k]).data,get_node(pv).data,dim), nbh_v_add[k]};

				// `farthest` compares lhs.d<rhs.d, so this sorts by ascending distance.
				std::sort(candidates.begin(), candidates.end(), farthest());

				nbh_v.resize(m_s);
				for(size_t k=0; k<m_s; ++k)
					nbh_v[k] = candidates[k].u;
				/*
				auto res = select_neighbors(get_node(pv).data, candidates, m_s, l_c);
				nbh_v.clear();
				for(auto *pu : res)
					nbh_v.push_back(pu);
				*/
				// nbh_v = select_neighbors(get_node(pv).data, candidates, m_s, l_c);
			}
			else nbh_v.insert(nbh_v.end(),nbh_v_add.begin(), nbh_v_add.end());
		});
	}

	debug_output("Updating entrance\n");
	// finally, update the entrance
	node_id node_highest = *std::max_element(
		node_new.get(), node_new.get()+size_batch, [&](const node_id u, const node_id v){
			return get_node(u).level < get_node(v).level;
	});
	// A strictly higher node replaces the entrance set; an equally high one joins it.
	if(get_node(node_highest).level>level_ep)
	{
		entrance.clear();
		entrance.push_back(node_highest);
		debug_output("New entrance [%u] at lev %u\n", U::get_id(get_node(node_highest).data), get_node(node_highest).level);
	}
	else if(get_node(node_highest).level==level_ep)
	{
		entrance.push_back(node_highest);
		debug_output("New entrance [%u] at lev %u\n", U::get_id(get_node(node_highest).data), get_node(node_highest).level);
	}

	// and add new nodes to the pool
	/*
	if(from_blank)
	node_pool.insert(node_pool.end(), node_new.get(), node_new.get()+size_batch);
	*/
}

// Best-first (beam) search over the graph view `g`, keeping at most `ef` results.
//
// Template parameters:
//   Conn - result element, built as {distance, node id}; ordered by its own
//          operator< (used both for the result heap and the candidate set).
//   G    - graph view exposing nid_t, num_nodes(), get_node() and get_edges().
//   D    - callable mapping a node's `data` to its distance from the query.
//   Seq  - sequence type of `eps`; also used for the visited table.
// Parameters:
//   eps  - entry points that seed the search.
//   ef   - beam width (upper bound on the size of the returned working set).
//   ctrl - `beta` scales the early-stop threshold, `limit_eval` caps the number
//          of expanded nodes, `count_cmps` (if set) receives the visit count.
// Returns the working set as a parlay::sequence<Conn> in heap order.
template<class Conn, class G, class D, class Seq>
auto beamSearch(
	const G &g, D f_dist, const Seq &eps, uint32_t ef, const search_control &ctrl={})
{
	using nid_t = typename G::nid_t;
	using conn = Conn;

	const auto n = g.num_nodes();
	// The visited table is a lossy, direct-mapped filter: a colliding id simply
	// overwrites the slot, so a node may occasionally be visited twice.
	const uint32_t bits = ef>2? std::ceil(std::log2(ef))*2-2: 2;
	const uint32_t mask = (1u<<bits)-1;
	Seq visited(mask+1, n+1); // n+1 marks an empty slot
	uint32_t cnt_visited = 0;
	parlay::sequence<conn> workset;
	std::set<conn> cand; // TODO: test dual heaps
	std::unordered_set<nid_t> is_inw; // TODO: test merge instead
	// TODO: get statistics about the merged size
	// TODO: switch to the alternative if exceeding a threshold
	workset.reserve(ef+1);

	// debug_output("look at eps\n");
	for(nid_t pe : eps)
	{
		// Use the same hash as the lookup in the main loop; hashing entry points
		// with a different function would leave them unrecognised as visited.
		visited[parlay::hash64_2(pe)&mask] = pe;
		const auto d = f_dist(g.get_node(pe).data);
		cand.insert({d,pe});
		workset.push_back({d,pe});
		is_inw.insert(pe);
		cnt_visited++;
	}
	std::make_heap(workset.begin(), workset.end());

	uint32_t cnt_eval = 0;
	uint32_t limit_eval = ctrl.limit_eval.value_or(n);
	while(cand.size()>0)
	{
		// Stop once the best remaining candidate is beyond the heap top scaled by beta.
		if(cand.begin()->d>workset[0].d*ctrl.beta) break;

		if(++cnt_eval>limit_eval) break;

		nid_t u = cand.begin()->u;
		cand.erase(cand.begin());
		for(nid_t pv: g.get_edges(u))
		{
			const auto h_pv = parlay::hash64_2(pv)&mask;
			if(visited[h_pv]==pv) continue;
			visited[h_pv] = pv;
			cnt_visited++;

			const auto d = f_dist(g.get_node(pv).data);
			if(!(workset.size()<ef||d<workset[0].d)) continue;
			// `is_inw` is exact, so a node never enters the working set twice even
			// when the lossy visited table lets it through again.
			if(!is_inw.insert(pv).second) continue;

			cand.insert({d,pv});
			workset.push_back({d,pv});
			std::push_heap(workset.begin(), workset.end());
			if(workset.size()>ef)
			{
				std::pop_heap(workset.begin(), workset.end());
				// is_inw.erase(workset.back().u);
				workset.pop_back();
			}
			if(cand.size()>ef)
				cand.erase(std::prev(cand.end()));
		}
	}

	if(ctrl.count_cmps)
		*ctrl.count_cmps.value() += cnt_visited;

	return workset;
}

// Searches layer `l_c` for the neighbours of `u`, starting from `eps`, by
// delegating to the shared `beam_search_impl`. The beam width and the result
// limit are both `ef`; the evaluation limit comes from `ctrl.limit_eval`
// (falling back to `n`). The frontier is returned as a sequence of `dist`
// entries, and the comparison count is added to `ctrl.count_cmps` when set.
//
// The earlier path (`dist_evaluator` + `beamSearch<dist>`) is superseded by
// this delegation.
template<typename U, template<typename> class Allocator>
auto HNSW<U,Allocator>::search_layer(const node &u, const parlay::sequence<node_id> &eps, uint32_t ef, uint32_t l_c, search_control ctrl) const
{
	graph g(*this,l_c);
	QueryParams QP(ef, ef, 1.35, ctrl.limit_eval.value_or(n), get_threshold_m(l_c));

	// Lazy view exposing the point payload of every node in the pool.
	auto points = parlay::delayed_seq<const T&>(
		node_pool.size(),
		[&](size_t idx) -> const T&{
			return node_pool[idx].data;
		}
	);

	auto outcome = beam_search_impl<node_id>(u.data, g, points, eps, QP);
	if(ctrl.count_cmps)
		*ctrl.count_cmps.value() += std::get<1>(outcome);

	// outcome = {{frontier, ...}, #comparisons}; frontier entries are (id, distance).
	const auto &frontier = std::get<0>(std::get<0>(outcome));
	return parlay::tabulate(frontier.size(), [&](size_t idx){
		const auto &entry = frontier[idx];
		return dist{entry.second, entry.first};
	});
}

// Previous implementation of the per-layer search (kept as a backup of
// `search_layer`). Starting from `eps`, it repeatedly expands the first
// element of the candidate set `C` and maintains a result heap `W` of at most
// `ef` entries for the query node `u` on layer `l_c`.
//
// `ctrl` fields read here: skip_search, beta, limit_eval, log_dist, log_size,
// radius and count_cmps. Per-worker counters (total_visited, total_size_C,
// total_eval, total_range_candidate) are updated as a side effect.
//
// Returns `W`; when `ctrl.radius` is set, only entries with d <= radius are
// kept and the in-radius entries that were evicted from `W` are appended.
template<typename U, template<typename> class Allocator>
auto HNSW<U,Allocator>::search_layer_bak(const node &u, const parlay::sequence<node_id> &eps, uint32_t ef, uint32_t l_c, search_control ctrl) const
{
	// Compile-time choice of the "visited" structure. Note that the macro is
	// never #undef'd, so it stays defined for the rest of the translation unit.
	#define USE_HASHTBL
	// #define USE_BOOLARRAY
	// #define USE_UNORDERED_SET
#ifdef USE_HASHTBL
	// Direct-mapped table of ids: a colliding id overwrites the slot, so the
	// filter is lossy and a node can be visited more than once.
	const uint32_t bits = ef>2? std::ceil(std::log2(ef*ef))-2: 2;
	const uint32_t mask = (1u<<bits)-1;
	parlay::sequence<uint32_t> visited(mask+1, n+1); // n+1 marks an empty slot
#endif
#ifdef USE_BOOLARRAY
	std::vector<bool> visited(n+1);
#endif
	// TODO: Try hash to an array
	// TODO: monitor the size of `visited`
	uint32_t cnt_visited = 0;
#ifdef USE_UNORDERED_SET
	std::unordered_set<uint32_t> visited;
#endif
	// W: result heap (ordered with farthest()); discarded: in-radius entries evicted from W.
	parlay::sequence<dist> W, discarded;
	// C: candidates still to expand, ordered with farthest(); C.begin() is expanded next.
	std::set<dist,farthest> C;
	// Exact record of everything ever pushed into W, to avoid duplicates.
	std::set<node_id> w_inserted;
	W.reserve(ef+1);

	// Seed all structures with the entry points.
	for(node_id ep : eps)
	{
	#ifdef USE_HASHTBL
		const auto id = U::get_id(get_node(ep).data);
		visited[parlay::hash64_2(id)&mask] = id;
	#endif
	#ifdef USE_BOOLARRAY
		// NOTE(review): `id` is only declared in the USE_HASHTBL branch above.
		visited[id] = true;
	#endif
	#ifdef USE_UNORDERED_SET
		visited.insert(U::get_id(get_node(ep).data));
	#endif
		cnt_visited++;
		const auto d = U::distance(u.data,get_node(ep).data,dim);
		C.insert({d,ep});
		W.push_back({d,ep});
		w_inserted.insert(ep);
	}
	// std::make_heap(C.begin(), C.end(), nearest());
	std::make_heap(W.begin(), W.end(), farthest());

	uint32_t cnt_eval = 0;
	uint32_t limit_eval = ctrl.limit_eval.value_or(n);
	while(C.size()>0)
	{
		if(ctrl.skip_search) break;
		// Stop when the next candidate is beyond the heap top scaled by beta.
		if(C.begin()->d>W[0].d*ctrl.beta) break;

		if(++cnt_eval>limit_eval) break;
		if(ctrl.log_dist)
		{
			std::array<float,5> t;

			if(ctrl.log_size)
			{
				// Only t[0..2] are assigned before this record is stored.
				t[0] = W[0].d;
				t[1] = W.size();
				t[2] = C.size();
				vc_in_search[*ctrl.log_size].push_back(t);
			}

			// Sample four distances from C at strides of C.size()/4, plus the last one.
			auto it = C.begin();
			const auto step = C.size()/4;
			for(uint32_t i=0; i<4; ++i)
				t[i]=it->d, std::advance(it,step);
			t[4] = C.rbegin()->d;

			dist_in_search[*ctrl.log_dist].push_back(t);
		}

		const auto &c = get_node(C.begin()->u);
		// std::pop_heap(C.begin(), C.end(), nearest());
		// C.pop_back();
		C.erase(C.begin());
		for(node_id pv: neighbourhood(c, l_c))
		{
		#ifdef USE_HASHTBL
			const auto id = U::get_id(get_node(pv).data);
			const auto idx = parlay::hash64_2(id)&mask;
			if(visited[idx]==id) continue;
			visited[idx] = id;
		#endif
		#ifdef USE_BOOLARRAY
			if(visited[id]) continue;
			visited[id] = true;
		#endif
		#ifdef USE_UNORDERED_SET
			if(!visited.insert(U::get_id(get_node(pv).data)).second) continue;
		#endif
			cnt_visited++;
			const auto d = U::distance(u.data,get_node(pv).data,dim);
			// Accept pv if W still has room or pv beats the heap top, and it was
			// never put into W before.
			if((W.size()<ef||d<W[0].d) && w_inserted.insert(pv).second)
			{
				C.insert({d,pv});
				// C.push_back({d,pv,dc+1});
				// std::push_heap(C.begin(), C.end(), nearest());
				W.push_back({d,pv});
				std::push_heap(W.begin(), W.end(), farthest());
				if(W.size()>ef)
				{
					std::pop_heap(W.begin(), W.end(), farthest());
					// w_inserted.erase(W.back().u);
					// For range queries, remember evicted entries that are still within the radius.
					if(ctrl.radius && W.back().d<=*ctrl.radius)
						discarded.push_back(W.back());
					W.pop_back();
				}
				// Keep the candidate set bounded by ef as well.
				if(C.size()>ef)
					C.erase(std::prev(C.end()));
			}
		}
	}

	//total_visited += visited.size();
	//total_visited += visited.size()-std::count(visited.begin(),visited.end(),n+1);
	// Accumulate statistics into the slot of the calling worker.
	const auto id = parlay::worker_id();
	total_visited[id] += cnt_visited;
	total_size_C[id] += C.size()+cnt_eval;
	total_eval[id] += cnt_eval;

	if(ctrl.count_cmps)
		*ctrl.count_cmps.value() += cnt_visited;

	if(ctrl.radius)
	{
		// Range search: keep only in-radius results, then add back the evicted ones.
		const auto rad = *ctrl.radius;
		auto split = std::partition(W.begin(), W.end(), [rad](const dist &e){
			return e.d <= rad;
		});
		W.resize(split-W.begin());
		W.append(discarded);
		total_range_candidate[parlay::worker_id()] += W.size();
	}
	return W;
}

// Experimental per-layer search that records the expansion order.
//
// Two bounded sets (both capped at `ef`, ordered with farthest()) are kept:
//   C     - candidates that have not been expanded yet;
//   C_acc - every accepted node, whether expanded or not.
// `cnt_used` counts the expanded nodes that are still inside C_acc; the search
// ends when all members of C_acc have been expanded (or C runs empty).
//
// Parameters:
//   u    - query node; only `u.data` is used for distances.
//   eps  - entry points seeding both sets.
//   ef   - capacity of C and C_acc.
//   l_c  - layer to search; statistics are only accumulated for layer 0.
//   ctrl - `verbose_output` enables tracing to stderr, `log_dist` selects the
//          `dist_in_search` row that receives the per-round distance range.
// Returns the expanded nodes (`W_`) in the order they were expanded.
template<typename U, template<typename> class Allocator>
auto HNSW<U,Allocator>::search_layer_new_ex(const node &u, const parlay::sequence<node_id> &eps, uint32_t ef, uint32_t l_c, search_control ctrl) const
{
	// printf-style tracing. The call is a no-op unless ctrl.verbose_output is
	// set, but its arguments are evaluated by the caller either way.
	auto verbose_output = [&](const char *fmt, ...){
		if(!ctrl.verbose_output) return;

		va_list args;
		va_start(args, fmt);
		vfprintf(stderr, fmt, args);
		va_end(args);
	};

	parlay::sequence<std::array<float,5>> dummy;
	auto &dist_range = ctrl.log_dist? dist_in_search[*ctrl.log_dist]: dummy;
	uint32_t cnt_eval = 0;

	// When tracing is off `indeg` is only a placeholder pointer into node_pool;
	// its values are computed for the trace arguments but never printed.
	auto *indeg = ctrl.verbose_output? get_indeg(l_c): reinterpret_cast<const uint32_t*>(node_pool.data());
	// parlay::sequence<bool> visited(n);
	// TODO: Try hash to an array
	// TODO: monitor the size of `visited`
	std::set<uint32_t> visited;
	// std::priority_queue<dist_ex,parlay::sequence<dist_ex>,nearest> C;
	// std::priority_queue<dist_ex,parlay::sequence<dist_ex>,farthest> W;
	parlay::sequence<dist_ex> /*C, W, */W_;
	std::set<dist_ex,farthest> C, C_acc;
	uint32_t cnt_used = 0;

	for(node_id ep : eps)
	{
		// visited[U::get_id(get_node(ep).data)] = true;
		const auto id = U::get_id(get_node(ep).data);
		visited.insert(id);
		const auto d = U::distance(u.data,get_node(ep).data,dim);
		C.insert({d,ep,1});
		C_acc.insert({d,ep,1});
		// C.push_back({d,ep,1});
		// W.push_back({d,ep,1});
		verbose_output("Insert\t[%u](%f) initially\n", id, d);
	}
	// std::make_heap(C.begin(), C.end(), nearest());
	// std::make_heap(W.begin(), W.end(), farthest());

	// static thread_local std::mt19937 gen{parlay::worker_id()};
	// static thread_local std::exponential_distribution<float> distro{48};
	while(C.size()>0)
	{
		// const auto &f = *(W[0].u);
		// if(U::distance(c.data,u.data,dim)>U::distance(f.data,u.data,dim))
		// if(C[0].d>W[0].d) break;
		if(C_acc.size()==cnt_used) break;
		cnt_eval++;

		if(ctrl.log_dist)
			dist_range.push_back({C.begin()->d,C.rbegin()->d});
		/*
		const auto dc = C[0].depth;
		const auto &c = *(C[0].u);
		*/
		auto it = C.begin();
		/*
		float quantile = distro(gen);
		if(quantile>C.size())
			quantile = C.size();
		const auto dis_min = C.begin()->d;
		const auto dis_max = C.rbegin()->d;
		const auto threshold = quantile/C.size()*(dis_max-dis_min) + dis_min - 1e-6;
		auto it = C.lower_bound(dist_ex{threshold,nullptr,0});
		*/
		// Copy everything needed from *it before erasing: the iterator is
		// invalid afterwards and must not be dereferenced again.
		const auto dc = it->depth;
		const auto d_c = it->d;
		const auto &c = *(it->u);
		// W_.push_back(C[0]);
		W_.push_back(*it);
		// std::pop_heap(C.begin(), C.end(), nearest());
		// C.pop_back();
		C.erase(it);
		cnt_used++;

		verbose_output("------------------------------------\n");
		const uint32_t id_c = U::get_id(c.data);
		verbose_output("Eval\t[%u](%f){%u}\t[%u]\n", id_c, d_c, dc, indeg[id_c]);
		uint32_t cnt_insert = 0;
		for(node_id pv: neighbourhood(c, l_c))
		{
			// if(visited[U::get_id(get_node(pv).data)]) continue;
			// visited[U::get_id(get_node(pv).data)] = true;
			if(!visited.insert(U::get_id(get_node(pv).data)).second) continue;
			// const auto &f = *(W[0].u);
			// if(W.size()<ef||U::distance(get_node(pv).data,u.data,dim)<U::distance(f.data,u.data,dim))
			const auto d = U::distance(u.data,get_node(pv).data,dim);
			// if(W.size()<ef||d<W[0].d)
			// if(C.size()<ef||d<C.rend()->d)
			{
				// C.push_back({d,pv,dc+1});
				// std::push_heap(C.begin(), C.end(), nearest());
				/*
				W.push_back({d,pv,dc+1});
				std::push_heap(W.begin(), W.end(), farthest());
				if(W.size()>ef)
				{
					std::pop_heap(W.begin(), W.end(), farthest());
					W.pop_back();
				}
				*/
				// Admit pv to the pending candidates if there is room or it beats the last one.
				if(C.size()<ef || d<C.rbegin()->d)
				{
					C.insert({d,pv,dc+1});
					const uint32_t id_v = U::get_id(get_node(pv).data);
					verbose_output("Insert\t[%u](%f){%u}\t[%u](%f)\n", 
						id_v, d, dc+1, 
						indeg[id_v], U::distance(c.data,get_node(pv).data,dim)
					);
					cnt_insert++;
					if(C.size()>ef)
					{
						// std::pop_heap(C.begin(), C.end(), nearest());
						// C.pop_back();
						C.erase(std::prev(C.end()));
					}
				}
				// Same admission rule for the accumulated set.
				if(C_acc.size()<ef || d<C_acc.rbegin()->d)
				{
					C_acc.insert({d,pv,dc+1});
					if(C_acc.size()>ef)
					{
						// If the evicted entry had already been expanded, it no
						// longer counts towards the termination test.
						auto it_last = std::prev(C_acc.end());
						if(std::find_if(W_.begin(), W_.end(), [&](const dist_ex &a){
							return a.u==it_last->u;
						})!=W_.end())
							cnt_used--;
						C_acc.erase(it_last);
					}
				}
			}
		}
		verbose_output("%u inserts in this round\n", cnt_insert);
	}
	if(l_c==0)
	{
		const auto id = parlay::worker_id();
		total_visited[id] += visited.size();
		total_size_C[id] += C.size()+cnt_eval;
		total_eval[id] += cnt_eval;
	}
	/*
	std::sort(W.begin(), W.end(), farthest());
	if(W.size()>ef) W.resize(ef);
	*/
	return W_;
}

// Beam search on layer `l_c` written in the frontier/visited style.
//
// `frontier` holds the best `beamSize` nodes seen so far (sorted by distance,
// ties broken by node id), `visited` the nodes that have been expanded, and
// `unvisited_frontier` their set difference. Each round expands the first
// unvisited frontier node; the search ends when the whole frontier is visited.
//
// Parameters:
//   u        - query node; only `u.data` is used for distances.
//   eps      - entry points. An empty `eps` yields an empty result.
//   beamSize - maximum size of the frontier.
//   l_c      - layer to search; `total_visited` is only updated for layer 0.
//   ctrl     - currently unused.
// Returns the expanded nodes sorted by distance.
template<typename U, template<typename> class Allocator>
auto HNSW<U,Allocator>::beam_search_ex(const node &u, const parlay::sequence<node_id> &eps, uint32_t beamSize, uint32_t l_c, search_control ctrl) const
// std::pair<parlay::sequence<dist_ex>, parlay::sequence<dist_ex>> beam_search(
		// T* p_coords, int beamSize)
{
	// beamSize *= 2;
	// beamSize = 20000;
	// initialize data structures
	parlay::sequence<dist_ex> visited;
	parlay::sequence<dist_ex> frontier;
	auto dist_less = [&](const dist_ex &a, const dist_ex &b) {
		return a.d < b.d || (a.d == b.d && a.u < b.u);
		// return a.u<b.u;
	};
	auto dist_eq = [&](const dist_ex &a, const dist_ex &b){
		return a.u == b.u;
	};

	// int bits = std::ceil(std::log2(beamSize * beamSize));
	// parlay::sequence<uint32_t> hash_table(1 << bits, std::numeric_limits<uint32_t>::max());
	std::set<uint32_t> accessed;

	auto make_pid = [&] (node_id ep) {
		const auto d = U::distance(u.data,get_node(ep).data,dim);
		return dist_ex{d,ep,1};
	};

	// the frontier starts with the medoid
	// frontier.push_back(make_pid(medoid->id));
	
	for(node_id ep : eps)
		frontier.push_back(make_pid(ep));
	std::sort(frontier.begin(), frontier.end(), dist_less);

	// Nothing to expand: the loop below would read unvisited_frontier[0].
	if(frontier.empty())
		return frontier;
	
	// frontier.push_back(make_pid(eps[0]));

	parlay::sequence<dist_ex> unvisited_frontier;
	// parlay::sequence<dist_ex> unvisited_frontier(beamSize);
	parlay::sequence<dist_ex> new_frontier;
	// parlay::sequence<dist_ex> new_frontier(2 * beamSize);
	bool not_done = true;


	for(size_t i=0; i<frontier.size(); ++i)
	{
		unvisited_frontier.push_back(frontier[i]);
		// unvisited_frontier[i] = frontier[i];
		accessed.insert(U::get_id(frontier[i].get_node(u).data));
	}

	// terminate beam search when the entire frontier has been visited
	while (not_done) {
		// the next node to visit is the unvisited frontier node that is closest
		// to p
		dist_ex currentPid = unvisited_frontier[0];
		node_id current_vtx = currentPid.u;
		debug_output("current_vtx ID: %u\n", U::get_id(get_node(current_vtx).data));

		// Returns true the first time a node id is seen.
		auto g = [&](node_id a) {
			uint32_t id_a = U::get_id(get_node(a).data);
			/*
			uint32_t loc = parlay::hash64_2(id_a) & ((1 << bits) - 1);
			if (hash_table[loc] == id_a) return false;
			hash_table[loc] = id_a;
			return true;
			*/
			return accessed.insert(id_a).second;
		};

		parlay::sequence<node_id> candidates;
		auto f = [&](node_id pu, node_id pv/*, empty_weight wgh*/) {
			if (g(pv)) {
				candidates.push_back(pv);
			}
			return true;
		};
		for(node_id pv : neighbourhood(get_node(current_vtx),l_c))
			// current_vtx.out_neighbors().foreach_cond(f);
			f(current_vtx, pv);

		debug_output("candidates:\n");
		for(node_id p : candidates)
			debug_output("%u ", U::get_id(get_node(p).data));
		debug_output("\n");
		auto pairCandidates =
				parlay::map(candidates, make_pid);
		/*
		auto sortedCandidates =
				parlay::unique(parlay::sort(pairCandidates, dist_less), dist_eq);
		*/
		auto &sortedCandidates = pairCandidates;
		debug_output("size of sortedCandidates: %lu\n", sortedCandidates.size());
		/*
		auto f_iter = std::set_union(
				frontier.begin(), frontier.end(), sortedCandidates.begin(),
				sortedCandidates.end(), new_frontier.begin(), dist_less);\
		*/
		// Merge the old frontier with the new candidates and keep the best beamSize.
		sortedCandidates.insert(sortedCandidates.end(), frontier);
		new_frontier = parlay::unique(parlay::sort(sortedCandidates,dist_less), dist_eq);

		// size_t f_size = std::min<size_t>(beamSize, f_iter - new_frontier.begin());
		size_t f_size = std::min<size_t>(beamSize, new_frontier.size());
		debug_output("f_size: %lu\n", f_size);

		debug_output("frontier (size: %lu)\n", frontier.size());
		for(const auto &e : frontier)
			debug_output("%u ", U::get_id(e.get_node(u).data));
		debug_output("\n");
		
		frontier =
				parlay::tabulate(f_size, [&](size_t i) { return new_frontier[i]; });
		debug_output("size of frontier: %lu\n", frontier.size());
		visited.insert(
				std::upper_bound(visited.begin(), visited.end(), currentPid, dist_less),
				currentPid);
		debug_output("size of visited: %lu\n", visited.size());
		// Rebuild the unvisited part of the frontier. Appending through a
		// back_inserter keeps size() in sync with the elements actually written
		// (writing through begin() after a mere reserve() would go past size()).
		unvisited_frontier.clear();
		unvisited_frontier.reserve(frontier.size());
		std::set_difference(frontier.begin(), frontier.end(), visited.begin(),
												visited.end(), std::back_inserter(unvisited_frontier), dist_less);
		debug_output("uf_iter - unvisited_frontier.begin(): %lu\n", unvisited_frontier.size());
		not_done = !unvisited_frontier.empty();

		if(l_c==0)
			total_visited[parlay::worker_id()] += candidates.size();
	}
	parlay::sequence<dist_ex> W;
	W.insert(W.end(), visited);
	return W;
}
/*
template<typename U, template<typename> class Allocator>
parlay::sequence<std::pair<uint32_t,float>> HNSW<U,Allocator>::search(const T &q, uint32_t k, uint32_t ef, search_control ctrl)
{
	auto res_ex = search_ex(q,k,ef,ctrl);
	parlay::sequence<std::pair<uint32_t,float>> res;
	res.reserve(res_ex.size());
	for(const auto &e : res_ex)
		res.emplace_back(std::get<0>(e), std::get<2>(e));

	return res;
}
*/

// Descends from the top layer (the level of the first entrance node) down to,
// but not including, layer `l_stop`. On every layer the search restarts from
// the first result of the layer above. Returns the entry points to use on
// layer `l_stop`.
//
// Only `log_per_stat` and `count_cmps` are forwarded from `ctrl` to the
// per-layer searches; `limit_eval` is deliberately not applied to upper layers.
template<typename U, template<typename> class Allocator>
parlay::sequence<typename HNSW<U,Allocator>::node_id> HNSW<U,Allocator>::search_layer_to(
	const node &u, uint32_t ef, uint32_t l_stop, const search_control &ctrl)
{
	auto eps = entrance;
	uint32_t l_c = get_node(entrance[0]).level;
	while(l_c>l_stop)
	{
		search_control layer_ctrl{};
		// whether count dist calculations at all layers
		layer_ctrl.log_per_stat = ctrl.log_per_stat;
		layer_ctrl.count_cmps = ctrl.count_cmps;

		const auto found = search_layer(u, eps, ef, l_c, layer_ctrl);
		// Continue on the next layer from the first result only.
		eps.clear();
		eps.push_back(found[0].u);

		--l_c;
	}
	return eps;
}

// Answers a query for point `q`.
//
// The entry point is either `ctrl.indicate_ep` or the result of a greedy
// descent (ef=1) to layer 0; layer 0 is then searched with beam width `ef`.
// For k-NN queries the results are sorted and cut to the best `k`, extended
// by any further entries that compare equal to the k-th one. For range
// queries (`ctrl.radius` set) `k` is ignored and nothing is cut.
//
// Returns (point id, distance) pairs. The per-worker statistics of the
// calling worker are reset at the start of each call.
template<typename U, template<typename> class Allocator>
parlay::sequence<std::pair<uint32_t,float>> HNSW<U,Allocator>::search(
	const T &q, uint32_t k, uint32_t ef, const search_control &ctrl)
{
	const auto worker = parlay::worker_id();
	total_range_candidate[worker] = 0;
	total_visited[worker] = 0;
	total_eval[worker] = 0;
	total_size_C[worker] = 0;

	node u{n, nullptr, q}; // To optimize

	parlay::sequence<node_id> eps;
	if(!ctrl.indicate_ep)
		eps = search_layer_to(u, 1, 0, ctrl);
	else
		eps.push_back(*ctrl.indicate_ep);

	auto found = search_layer(u, eps, ef, 0, ctrl);

	// the range search ignores the given k
	const bool is_range_query = static_cast<bool>(ctrl.radius);
	if(!is_range_query && found.size()>k)
	{
		std::sort(found.begin(), found.end(), farthest());
		if(k>0)
		{
			// Extend the cut past entries that tie with the k-th result.
			const auto cut = std::upper_bound(found.begin()+k, found.end(), found[k-1], farthest());
			k = cut-found.begin();
		}
		found.resize(k);
	}

	parlay::sequence<std::pair<uint32_t,float>> res;
	res.reserve(found.size());
	for(const auto &entry : found)
		res.push_back({U::get_id(get_node(entry.u).data), entry.d});
	return res;
}

// Serialises the index to `filename_model` in a binary format:
//   header   : "HNSW", version (uint32_t 3), a type tag for U, sizeof(node)
//   params   : dim, m_l, m, ef_construction, alpha, n
//   nodes    : (level, point id) for every node in the pool
//   edges    : for every node and every layer 0..level, the neighbour count
//              followed by the neighbour ids
//   entrance : count followed by the entrance node ids
// All values are written as raw bytes in host representation.
//
// Throws std::runtime_error if the file cannot be created or written.
template<typename U, template<typename> class Allocator>
void HNSW<U,Allocator>::save(const std::string &filename_model) const
{
	// The custom stream buffer is declared BEFORE the stream so that it is
	// destroyed AFTER it: the stream flushes pending data from its buffer when
	// it is closed/destroyed, so the buffer must still be alive at that point.
	const auto size_buffer = 1024*1024*1024; // 1G
	auto buffer = std::make_unique<char[]>(size_buffer);

	// Install the buffer before opening the file: some standard libraries
	// ignore pubsetbuf() once the filebuf is associated with a file.
	std::ofstream model;
	model.rdbuf()->pubsetbuf(buffer.get(), size_buffer);
	model.open(filename_model, std::ios::binary);
	if(!model.is_open())
		throw std::runtime_error("Failed to create the model");

	// write(value)        : writes a standard-layout value byte-wise
	// write(array)        : writes every element of a built-in array
	// write(ptr, count)   : writes `count` elements starting at `ptr`
	const auto write = [&](const auto &data, auto ...args){
		// `f` is the lambda itself, passed explicitly to allow recursion.
		auto write_impl = [&](auto &f, const auto &data, auto ...args){
			using T = std::remove_reference_t<decltype(data)>;
			if constexpr(std::is_pointer_v<std::decay_t<T>>)
			{
				auto write_array = [&](const auto &data, size_t size, auto ...args){
					for(size_t i=0; i<size; ++i)
						f(f, data[i], args...);
				};
				// use the array extent as the size
				if constexpr(sizeof...(args)==0 && std::is_array_v<T>)
				{
					write_array(data, std::extent_v<T>);
				}
				else
				{
					static_assert(sizeof...(args), "size was not provided");
					write_array(data, args...);
				}
			}
			else
			{
				static_assert(std::is_standard_layout_v<T>);
				model.write((const char*)&data, sizeof(data));
			}
		};
		write_impl(write_impl, data, args...);
	};
	// write header (version number, type info, etc)
	write("HNSW", 4); // 4 characters, without the terminating NUL
	write(uint32_t(3)); // version
	write(typeid(U).hash_code()^sizeof(U));
	fprintf(stderr, "U type written %s\n", typeid(U).name());
	write(sizeof(node));
	// write parameter configuration
	write(dim);
	write(m_l);
	write(m);
	write(ef_construction);
	write(alpha);
	write(n);
	// write indices
	for(const auto &u : node_pool)
	{
		write(u.level);
		write(uint32_t(U::get_id(u.data)));
	}
	for(const auto &u : node_pool)
	{
		for(uint32_t l=0; l<=u.level; ++l)
		{
			write(u.neighbors[l].size());
			for(node_id pv : u.neighbors[l])
				write(pv);
		}
	}
	// write entrances
	write(entrance.size());
	for(node_id pu : entrance)
		write(pu);

	// Flush explicitly so that a failed write is reported instead of being
	// silently dropped by the destructor.
	model.close();
	if(!model)
		throw std::runtime_error("Failed to write the model");
}

} // namespace HNSW

#endif // _HNSW_HPP


================================================
FILE: algorithms/HNSW/debug.hpp
================================================
#ifndef __DEBUG_HPP__
#define __DEBUG_HPP__

// Debug logs shared with the search code; the definitions live elsewhere.
// Each log is a sequence of rows, and every record in a row is five floats.
// Row of `dist_in_search` selected by search_control::log_dist.
extern parlay::sequence<parlay::sequence<std::array<float,5>>> dist_in_search;
// Row of `vc_in_search` selected by search_control::log_size.
extern parlay::sequence<parlay::sequence<std::array<float,5>>> vc_in_search;
// extern parlay::sequence<uint32_t> round_in_search;
// NOTE(review): presumably per-query counters (visited nodes, expanded nodes,
// candidate-set sizes) -- confirm against the code that fills them.
extern parlay::sequence<size_t> per_visited;
extern parlay::sequence<size_t> per_eval;
extern parlay::sequence<size_t> per_size_C;

#include <optional>

// Optional knobs and debug hooks for a single search call.
// Every member has a default, so `search_control c;` and `search_control c{}`
// both yield a fully-initialised "plain search" configuration.
struct search_control{
	bool verbose_output = false;          // trace the search to stderr
	bool skip_search = false;             // return right after seeding with the entry points
	float beta = 1;                       // factor applied to the early-stop distance threshold
	std::optional<float> radius;          // set => range search with this radius
	std::optional<uint32_t> log_per_stat; // forwarded to per-layer searches (statistics logging)
	std::optional<uint32_t> log_dist;     // row of `dist_in_search` to log distances into
	std::optional<uint32_t> log_size;     // row of `vc_in_search` to log sizes into
	std::optional<uint32_t> indicate_ep;  // explicit entry point, bypassing the layer descent
	std::optional<uint32_t> limit_eval;   // upper bound on the number of expanded nodes
	std::optional<uint32_t*> count_cmps;  // if set, the visit/comparison count is added to *ptr
};

#endif // _DEBUG_HPP_


================================================
FILE: algorithms/HNSW/dist.hpp
================================================
#ifndef __DIST_HPP__
#define __DIST_HPP__

#include <type_traits>
#include "type_point.hpp"
#include "../utils/NSGDist.h"

template<typename T>
class descr_ang
{
	using promoted_type = std::conditional_t<std::is_integral_v<T>&&sizeof(T)<=4,
		std::conditional_t<sizeof(T)==4, int64_t, int32_t>,
		float
	>;
public:
	typedef T type_elem;
	typedef point<T> type_point;
	static float distance(const type_point &u, const type_point &v, uint32_t dim)
	{
		const auto *uc=u.coord, *vc=v.coord;
		promoted_type dot=0, nu=0, nv=0;
		for(uint32_t i=0; i<dim; ++i)
		{
			nu += promoted_type(uc[i])*uc[i];
			nv += promoted_type(vc[i])*vc[i];
			dot += promoted_type(uc[i])*vc[i];
		}
		return 1-dot/(sqrtf(nu)*sqrtf(nv));
	}

	static auto get_id(const type_point &u)
	{
		return u.id;
	}
};

template<typename T>
class descr_ndot
{
	using promoted_type = std::conditional_t<std::is_integral_v<T>&&sizeof(T)<=4,
		std::conditional_t<sizeof(T)==4, int64_t, int32_t>,
		float
	>;
public:
	typedef T type_elem;
	typedef point<T> type_point;
	static float distance(const type_point &u, const type_point &v, uint32_t dim)
	{
		const auto *uc=u.coord, *vc=v.coord;
		promoted_type dot=0;
		for(uint32_t i=0; i<dim; ++i)
			dot += promoted_type(uc[i])*vc[i];
		return -float(dot);
	}

	static auto get_id(const type_point &u)
	{
		return u.id;
	}
};

template<typename T>
class descr_l2
{
	using promoted_type = std::conditional_t<std::is_integral_v<T>&&sizeof(T)<=4,
		std::conditional_t<sizeof(T)==4, int64_t, int32_t>,
		float
	>;
public:
	typedef T type_elem;
	typedef point<T> type_point;
	static float distance(const type_point &u, const type_point &v, uint32_t dim)
	{
		if constexpr(std::is_integral_v<T>)
		{
			const auto *uc=u.coord, *vc=v.coord;
			promoted_type sum = 0;
			for(uint32_t i=0; i<dim; ++i)
			{
				const auto d = promoted_type(uc[i])-vc[i];
				sum += d*d;
			}
			return sum;
		}
		else
		{
			const auto *uc=u.coord, *vc=v.coord;
			efanna2e::DistanceL2 distfunc;
			return distfunc.compare(uc, vc, dim);
		}
	}

	static auto get_id(const type_point &u)
	{
		return u.id;
	}
};

#endif // __DIST_HPP__


================================================
FILE: algorithms/HNSW/h5_ops.hpp
================================================
#ifndef __H5_OPS_HPP__
#define __H5_OPS_HPP__

#include <cstdio>
#include <array>
#include <tuple>
#include <memory>
#include <type_traits>
#include <H5Cpp.h>

// Opens dataset `dir` of the HDF5 file `file` and returns a {reader, dims} tuple.
//
// `dims` is the extent of the dataset as {rows, columns}.
// `reader(buffer, index, cnt)` reads the rows `index` .. `index+cnt-1`
// (`cnt` defaults to 1) and writes them contiguously into the 1D `buffer`,
// which must have room for `cnt*dims[1]` elements of type T.
//
// T selects the in-memory element type and must be one of
// uint32_t, int32_t, uint8_t, int8_t or float.
// NOTE(review): the dataset is assumed to be two-dimensional -- confirm for new inputs.
template<typename T>
auto get_reader(const char *file, const char *dir)
{
	H5::H5File file_h5(file, H5F_ACC_RDONLY);
	H5::DataSet dset = file_h5.openDataSet(dir);
	H5::DataSpace dspace_src = dset.getSpace();
	hsize_t dim[2] = {0, 0};
	dspace_src.getSimpleExtentDims(dim);
	// hsize_t is not guaranteed to be `unsigned long long`; cast to match %llu.
	fprintf(stderr, "%s: [%llu,%llu]\n", dir,
		static_cast<unsigned long long>(dim[0]),
		static_cast<unsigned long long>(dim[1]));

	H5::DataType type_dst = dset.getDataType();
	if constexpr(std::is_same_v<T,uint32_t>)
		type_dst = H5::PredType::NATIVE_UINT32;
	else if constexpr(std::is_same_v<T,int32_t>)
		type_dst = H5::PredType::NATIVE_INT32;
	else if constexpr(std::is_same_v<T,uint8_t>)
		type_dst = H5::PredType::NATIVE_UINT8;
	else if constexpr(std::is_same_v<T,int8_t>)
		type_dst = H5::PredType::NATIVE_INT8;
	else if constexpr(std::is_same_v<T,float>)
		type_dst = H5::PredType::NATIVE_FLOAT;
	else static_assert(std::is_same_v<T,uint32_t>/*always false*/, "Unsupported type");

	// The file handle is moved into the closure so that it stays open for as
	// long as the reader exists; the other handles are captured by copy.
	auto reader = [=,_=std::move(file_h5)](T *buffer, hsize_t index, hsize_t cnt=1){
		// Destination: a flat array of cnt*columns elements.
		hsize_t size = dim[1]*cnt;
		H5::DataSpace dspace_dst(1,&size,NULL);

		// Source: the hyperslab covering `cnt` full rows starting at `index`.
		hsize_t offset[2] = {index, 0};
		hsize_t count[2] = {cnt, dim[1]};
		H5::DataSpace dspace_slice;
		dspace_slice.copy(dspace_src);
		dspace_slice.selectHyperslab(H5S_SELECT_SET, count, offset);

		dset.read(buffer, type_dst, dspace_dst, dspace_slice);
	};

	return std::tuple{reader, std::array{dim[0], dim[1]}};
}

// read a 2D array from H5 file and return a 1D array
template<typename T>
std::pair<std::unique_ptr<T[]>,std::array<hsize_t,2>> read_array_from_HDF5(const char *file, const char *dir)
{
	auto [reader,dims] = get_reader<T>(file, dir);
	auto buffer = std::make_unique<T[]>(dims[0]*dims[1]);
	reader(buffer.get(), 0, dims[0]);
	return {std::move(buffer), dims};
}

#endif // __H5_OPS_HPP__


================================================
FILE: algorithms/HNSW/type_point.hpp
================================================
#ifndef __TYPE_POINT_HPP__
#define __TYPE_POINT_HPP__

#include <cstdint>
#include <cstddef>
#include <iterator>
#include <algorithm>
#include <memory>
#include <type_traits>
#include <stdexcept>
#include <any>
#include "benchUtils.h"

#ifdef SUPPORT_HDF5
#include "h5_ops.hpp"
#endif

// Base class offering two protected constructors: the default one does
// nothing, while the `int` overload terminates the program when invoked.
class internal_termination
{
protected:
	internal_termination()
	{
	}

	internal_termination(int)
	{
		std::terminate();
	}
};

// Wrapper that makes a (possibly move-only) object look copy-constructible.
//
// Moving transfers the content as usual. "Copying" does NOT copy the content:
// the copy's `content` is default-initialized and the source keeps its own.
template<typename T>
class fake_copyable : public internal_termination
{
	T content;

public:
	fake_copyable(const T &c)
		: content(c)
	{
	}
	fake_copyable(T &&c)
		: content(std::move(c))
	{
	}

	fake_copyable(fake_copyable&&) = default;

	// The users have to guarantee to hold the points while it is being used in graph.
	// Otherwise, replace the initializer list below with the guarding code
	//     : internal_termination(0),
	//       content(std::move(const_cast<fake_copyable&>(other).content))
	// to forbid copy constructions, or alternatively pass in copy-constructible
	// objects (e.g., `std::shared_ptr`) to `point` instead of using this hack.
	fake_copyable(const fake_copyable&)
		: internal_termination()
	{
	}
};

// Deduction guides: deduce the wrapped type from the constructor argument.
template<typename T>
fake_copyable(const T&) -> fake_copyable<T>;
template<typename T>
fake_copyable(T&&) -> fake_copyable<T>;

// A point: an id plus a non-owning pointer to its coordinates.
// An optional `closure` object is stored alongside and destroyed together
// with the point; it is never read back through this interface.
template<typename T>
struct point
{
	using type = T;

	uint32_t id;
	const T *coord;

	// Placeholder point: id is ~0u and there are no coordinates.
	point()
		: point(~0u, nullptr)
	{
	}

	// Point referring to coordinates held by someone else.
	point(uint32_t id_, const T *coord_)
		: id(id_), coord(coord_), closure()
	{
	}

	// Point that additionally stores `closure_`.
	template<class C>
	point(uint32_t id_, const T *coord_, C &&closure_)
		: id(id_), coord(coord_), closure(std::forward<C>(closure_))
	{
	}

private:
	std::any closure;
};

// Supported input file formats.
enum class file_format
{
	VEC,
	HDF5,
	BIN
};

// Default converter from a coordinate range [begin, end) to a point<T>.
//
// The iterator type decides who owns the coordinates:
//   * ptr_mapped<..., PERSISTENT>: the memory outlives the point, so the
//     point just refers to it.
//   * ptr_mapped<..., TRANSITIVE>: the point takes ownership of the buffer,
//     which must have been allocated with `new T[]` (as load_from_HDF5 does).
//   * anything else: the coordinates are copied into a freshly allocated
//     array owned by the point.
template<typename T>
class point_converter_default
{
public:
	using type = point<T>;

	template<typename Iter>
	type operator()(uint32_t id, Iter begin, [[maybe_unused]] Iter end)
	{
		using type_src = typename std::iterator_traits<Iter>::value_type;
		static_assert(std::is_convertible_v<type_src,T>, "Cannot convert to the target type");

		if constexpr(std::is_same_v<Iter,ptr_mapped<T,ptr_mapped_src::PERSISTENT>>||
			std::is_same_v<Iter,ptr_mapped<const T,ptr_mapped_src::PERSISTENT>>)
			return point<T>(id, &*begin);
		else if constexpr(std::is_same_v<Iter,ptr_mapped<T,ptr_mapped_src::TRANSITIVE>>||
			std::is_same_v<Iter,ptr_mapped<const T,ptr_mapped_src::TRANSITIVE>>)
		{
			const T *p = &*begin;
			// Own the buffer as an array so that it is released with delete[],
			// matching its allocation with new[].
			return point<T>(id, p, fake_copyable(std::unique_ptr<const T[]>(p)));
		}
		else
		{
			const uint32_t dim = std::distance(begin, end);

			// T *coord = new T[dim];
			auto coord = std::make_unique<T[]>(dim);
			for(uint32_t i=0; i<dim; ++i)
				coord[i] = *(begin+i);
			// Take the raw pointer before the owner is moved into the closure.
			return point<T>(id, coord.get(), fake_copyable(std::move(coord)));
		}
	}
};

template<typename Src, class Conv>
inline std::pair<parlay::sequence<typename Conv::type>,uint32_t>
load_from_vec(const char *file, Conv converter, uint32_t max_num)
{
	const auto [fileptr, length] = mmapStringFromFile(file);

	// Each vector is 4 + sizeof(Src)*dim bytes.
	// * first 4 bytes encode the dimension (as an uint32_t)
	// * next dim values are Src-type variables representing vector components
	// See http://corpus-texmex.irisa.fr/ for more details.

	const uint32_t dim = *((const uint32_t*)fileptr);
	std::cout << "Dimension = " << dim << std::endl;

	const size_t vector_size = sizeof(dim) + sizeof(Src)*dim;
	const uint32_t n = std::min<size_t>(length/vector_size, max_num);
	// std::cout << "Num vectors = " << n << std::endl;

	typedef ptr_mapped<const Src,ptr_mapped_src::PERSISTENT> type_ptr;
	parlay::sequence<typename Conv::type> ps(n);

	parlay::parallel_for(0, n, [&,fp=fileptr] (size_t i) {
		const Src *coord = (const Src*)(fp+sizeof(dim)+i*vector_size);
		ps[i] = converter(i, type_ptr(coord), type_ptr(coord+dim));
	});

	return {std::move(ps), dim};
}

// trait_type<X>::type names the element type associated with X.
// The primary template defines nothing, so unsupported X have no `type`.
template<class, class=void>
class trait_type
{
};

// Any class exposing a nested `type` (e.g. point<T>): use that nested type.
template<class Holder>
class trait_type<Holder,std::void_t<typename Holder::type>>
{
public:
	using type = typename Holder::type;
};

// Raw pointers: the pointee type.
template<class Elem>
class trait_type<Elem*,void>
{
public:
	using type = Elem;
};

// parlay sequences: the value type.
template<class Elem>
class trait_type<parlay::sequence<Elem>,void>
{
public:
	using type = Elem;
};

// Loads at most `max_num` points from dataset `dir` of the HDF5 file `file`.
// Each row is read into its own `new T[dim]` buffer, which is handed to
// `converter` through TRANSITIVE pointers. Returns the points together with
// the dimension.
//
// Throws std::invalid_argument when built without SUPPORT_HDF5.
template<class Conv>
inline std::pair<parlay::sequence<typename Conv::type>,uint32_t>
load_from_HDF5(const char *file, const char *dir, Conv converter, uint32_t max_num)
{
#ifndef SUPPORT_HDF5
	// Silence unused-parameter warnings in builds without HDF5.
	(void)file;
	(void)dir;
	(void)converter;
	(void)max_num;
	throw std::invalid_argument("HDF5 support is not enabled");
#else
	using T = typename trait_type<typename Conv::type>::type;
	using type_ptr = ptr_mapped<T,ptr_mapped_src::TRANSITIVE>;

	auto [reader,bound] = get_reader<T>(file, dir);
	const size_t n = std::min<size_t>(bound[0], max_num);
	const uint32_t dim = bound[1];

	parlay::sequence<typename Conv::type> points(n);
	// TODO: parallel for-loop
	for(uint32_t row=0; row<n; ++row)
	{
		T *coord = new T[dim];
		reader(coord, row);
		points[row] = converter(row, type_ptr(coord), type_ptr(coord+dim));
	}
	return {std::move(points), dim};
#endif
}

template<typename Src, class Conv>
inline std::pair<parlay::sequence<typename Conv::type>,uint32_t>
load_from_bin(const char *file, Conv converter, uint32_t max_num)
{
	auto [fileptr, length] = mmapStringFromFile(file); (void)length;
	const uint32_t n = std::min(max_num, *((uint32_t*)fileptr));
	const uint32_t dim = *((uint32_t*)(fileptr+sizeof(n)));
	const size_t vector_size = sizeof(Src)*dim;
	const size_t header_size = sizeof(n)+sizeof(dim);

	typedef ptr_mapped<const Src,ptr_mapped_src::PERSISTENT> type_ptr;
	parlay::sequence<typename Conv::type> ps(n);
	parlay::parallel_for(0, n, [&,fp=fileptr](uint32_t i){
		const Src *coord = (const Src*)(fp+header_size+i*vector_size);
		ps[i] = converter(i, type_ptr(coord), type_ptr(coord+dim));
	});

	return {std::move(ps), dim};
}

template<typename Src, class Conv>
inline std::pair<parlay::sequence<typename Conv::type>,uint32_t>
load_from_range(const char *file, Conv converter, uint32_t max_num)
{
	auto [fileptr, length] = mmapStringFromFile(file); (void)length;
	const int32_t num_points = *(int32_t*)fileptr;
	const int32_t num_matches = *(int32_t*)(fileptr+sizeof(num_points));
	const size_t header_size = sizeof(num_points)+sizeof(num_matches);

	int32_t* begin = (int32_t*)(fileptr+header_size);
	int32_t* end = begin + num_points;
	auto [offsets, total] = parlay::scan(parlay::make_slice(begin,end));
	offsets.push_back(total);
	std::cout << "num_matches: " << num_matches << ' ' << total << std::endl;

	const size_t index_size = header_size+num_points*sizeof(*begin);
	std::cout << "index_size: " << index_size << std::endl;

	typedef ptr_mapped<const Src,ptr_mapped_src::PERSISTENT> type_ptr;
	const uint32_t n = std::min<uint32_t>(max_num, num_points);
	parlay::sequence<typename Conv::type> ps(n);
	parlay::parallel_for(0, n, [&,fp=fileptr](uint32_t i){
		const Src *begin = (const Src*)(fp+index_size+offsets[i]*sizeof(Src));
		const Src *end = (const Src*)(fp+index_size+offsets[i+1]*sizeof(Src));
		ps[i] = converter(i, type_ptr(begin), type_ptr(end));
	});

	return {std::move(ps), 0};
}
/*
template<typename Src=void, class Conv>
inline auto load_point(const char *file, file_format input_format, Conv converter, size_t max_num=0, std::any aux={})
{
	if(!max_num)
		max_num = std::numeric_limits<decltype(max_num)>::max();

	switch(input_format)
	{
	case file_format::VEC:
		return load_from_vec<Src>(file, converter, max_num);
	case file_format::HDF5:
		return load_from_HDF5(file, std::any_cast<const char*>(aux), converter, max_num);
	case file_format::BIN:
		return load_from_bin<Src>(file, converter, max_num);
	default:
		__builtin_unreachable();
	}
}
*/
/**
 * Load points described by `input_name`, which has the form "<file>:<spec>".
 *
 * <spec> selects the loader:
 *   - a spec starting with '/' is passed to load_from_HDF5 as the dataset path
 *   - "fvecs" / "bvecs" / "ivecs"                 -> load_from_vec
 *   - "u8bin" / "i8bin" / "ibin" / "ubin" / "fbin" -> load_from_bin
 *   - "irange"                                    -> load_from_range
 *
 * @param converter  callable forwarded to the loader; builds one point from
 *                   (index, begin, end)
 * @param max_num    upper bound on the number of points; 0 means "no limit"
 * @return whatever the selected loader returns (points plus dimension)
 * @throws std::invalid_argument if no ':' is present or the spec is unknown
 */
template<class Conv>
inline auto load_point(const char *input_name, Conv converter, size_t max_num=0)
{
	// Work on a private copy so the ':' separator can be overwritten.
	auto buffer = std::make_unique<char[]>(strlen(input_name)+1);
	strcpy(buffer.get(), input_name);

	char *splitter = strchr(buffer.get(), ':');
	if(splitter==nullptr)
		throw std::invalid_argument("The input spec is not specified");

	*(splitter++) = '\0';
	const char *file = buffer.get();
	const char *input_spec = splitter;

	if(!max_num)
		max_num = std::numeric_limits<decltype(max_num)>::max();

	// The HDF5/bin/range loaders take a uint32_t limit. Clamp instead of
	// letting the implicit conversion wrap (e.g. 2^32 would become 0 and
	// silently load nothing).
	constexpr size_t limit32 = std::numeric_limits<uint32_t>::max();
	const uint32_t max_num32 = static_cast<uint32_t>(max_num<limit32? max_num: limit32);

	if(input_spec[0]=='/')
		return load_from_HDF5(file, input_spec, converter, max_num32);
	if(!strcmp(input_spec,"fvecs"))
		return load_from_vec<float>(file, converter, max_num);
	if(!strcmp(input_spec,"bvecs"))
		return load_from_vec<uint8_t>(file, converter, max_num);
	if(!strcmp(input_spec,"ivecs"))
		return load_from_vec<int32_t>(file, converter, max_num);
	if(!strcmp(input_spec,"u8bin"))
		return load_from_bin<uint8_t>(file, converter, max_num32);
	if(!strcmp(input_spec,"i8bin"))
		return load_from_bin<int8_t>(file, converter, max_num32);
	if(!strcmp(input_spec,"ibin"))
		return load_from_bin<int32_t>(file, converter, max_num32);
	if(!strcmp(input_spec,"ubin"))
		return load_from_bin<uint32_t>(file, converter, max_num32);
	if(!strcmp(input_spec,"fbin"))
		return load_from_bin<float>(file, converter, max_num32);
	if(!strcmp(input_spec,"irange"))
		return load_from_range<int32_t>(file, converter, max_num32);

	throw std::invalid_argument("Unsupported input spec");
}

#endif // __TYPE_POINT_HPP_


================================================
FILE: algorithms/bench/BUILD
================================================
# Targets in this package are visible to every other package by default.
package(default_visibility = ["//visibility:public"])

# Header-only library (no srcs) exposing parse_command_line.h.
cc_library(
  name = "parse_command_line",
  hdrs = ["parse_command_line.h"],
)

================================================
FILE: algorithms/bench/IO.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <iostream>
#include <fstream>
#include <string>
#include <string>
#include <cstring>
#include "parlay/primitives.h"
#include "parlay/parallel.h"
#include "parlay/io.h"
#include "parlay/internal/get_time.h"

namespace benchIO {
  using namespace std;
  using parlay::sequence;
  using parlay::tabulate;
  using parlay::make_slice;

  // Predicate: true for carriage return, tab, newline, NUL and space.
  auto is_space = [] (char c) {
    return c == '\r' || c == '\t' || c == '\n' || c == 0 || c == ' ';
  };

  // Splits Str into words, in parallel.
  // Side effect: every character for which is_space holds is overwritten
  // with 0, so each word ends up null-terminated inside Str. The result holds
  // one pointer per word, pointing at its first character within Str.
  template <class Seq>
  parlay::sequence<char*> stringToWords(Seq &Str) {
    const size_t len = Str.size();

    // null out the separators
    parlay::parallel_for(0, len, [&] (long pos) {
      if (is_space(Str[pos])) Str[pos] = 0;
    });

    // a word starts at a non-null character that is first or follows a null
    auto starts = parlay::tabulate(len, [&] (long pos) -> bool {
      if (pos == 0) return Str[0];
      return Str[pos] && !Str[pos-1];
    });

    // index of each word start
    auto word_offsets = parlay::pack_index<long>(starts);

    // turn the indices into pointers into Str
    return parlay::tabulate(word_offsets.size(), [&] (long k) -> char* {
      return Str.begin() + word_offsets[k];
    });
  }

  //using this as a typename so we can replace with parlay::chars easily if desired
  using charstring = typename parlay::sequence<char>;

  inline int xToStringLen(charstring const &a) { return a.size();}
  inline void xToString(char* s, charstring const &a) {
    for (int i=0; i < a.size(); i++) s[i] = a[i];}

  // Integer overloads. xToStringLen returns a fixed buffer size that covers
  // the longest decimal rendering plus the NUL that sprintf appends;
  // xToString writes the decimal text into s.

  // long: "-9223372036854775808" is 20 characters, + NUL
  inline int xToStringLen(long a) {
    return 21;
  }
  inline void xToString(char* s, long a) {
    sprintf(s, "%ld", a);
  }

  // unsigned long: "18446744073709551615" is 20 characters, + NUL
  inline int xToStringLen(unsigned long a) {
    return 21;
  }
  inline void xToString(char* s, unsigned long a) {
    sprintf(s, "%lu", a);
  }

  // unsigned int: "4294967295" is 10 characters, + NUL
  inline uint xToStringLen(uint a) {
    return 12;
  }
  inline void xToString(char* s, uint a) {
    sprintf(s, "%u", a);
  }

  // int: "-2147483648" is 11 characters, + NUL
  inline int xToStringLen(int a) {
    return 12;
  }
  inline void xToString(char* s, int a) {
    sprintf(s, "%d", a);
  }

  // double, printed as "%.11le". The longest rendering is
  //   sign(1) + digit(1) + '.'(1) + 11 digits + 'e'(1) + sign(1) + 3 exponent digits
  // = 19 characters, so 20 bytes are needed including the NUL sprintf writes.
  // (The previous value of 18 was too small for e.g. -1.79769313486e+308.)
  inline int xToStringLen(double a) { return 20;}
  inline void xToString(char* s, double a) { sprintf(s,"%.11le", a);}

  // C string: its length plus one byte for the terminating NUL.
  inline int xToStringLen(char* a) {
    return strlen(a)+1;
  }
  // Copies the C string a (including its NUL) into s.
  inline void xToString(char* s, char* a) {
    sprintf(s, "%s", a);
  }

  // pair: room for both components plus the single separating space.
  template <class A, class B>
  inline int xToStringLen(pair<A,B> a) {
    const int first_len = xToStringLen(a.first);
    const int second_len = xToStringLen(a.second);
    return first_len + second_len + 1;
  }

  // pair: writes the first component at s, a space at the end of the first
  // component's reserved length, and the second component right after it.
  template <class A, class B>
  inline void xToString(char* s, pair<A,B> a) {
    const int first_len = xToStringLen(a.first);
    char* second_pos = s + first_len + 1;
    xToString(s, a.first);
    s[first_len] = ' ';
    xToString(second_pos, a.second);
  }

  // Renders the elements of A as text, one element per line.
  // The returned charstring ends with a 0 byte after the final newline;
  // callers write size()-1 bytes to skip it.
  // An empty A yields a charstring holding only that terminating 0.
  template <class Seq>
  charstring seqToString(Seq const &A) {
    size_t n = A.size();
    // Guard: with n == 0 the code below would evaluate n-1 (wraps around)
    // and index L[n-1] / Bs[m-1] out of bounds.
    if (n == 0) return charstring(1, (char) 0);

    // reserved length of each element, plus one byte for its newline
    auto L = parlay::tabulate(n, [&] (size_t i) -> long {
	typename Seq::value_type x = A[i];
	return xToStringLen(x)+1;});
    size_t m;
    // exclusive prefix sums: L[i] becomes the start offset of element i,
    // m the total number of reserved bytes
    std::tie(L,m) = parlay::scan(std::move(L));

    // zero-filled scratch buffer; unused bytes stay 0 and are filtered out below
    charstring B(m+1, (char) 0);
    char* Bs = B.begin();

    parlay::parallel_for(0, n-1, [&] (long i) {
      xToString(Bs + L[i], A[i]);
      Bs[L[i+1] - 1] = '\n';
      });
    // the last element has no successor offset, so it is handled separately
    xToString(Bs + L[n-1], A[n-1]);
    Bs[m] = Bs[m-1] = '\n';
    
    // drop the padding zeros, then turn the extra trailing newline into the terminator
    charstring C = parlay::filter(B, [&] (char c) {return c != 0;}); 
    C[C.size()-1] = 0;
    return C;
  }

  // Writes A to os as text, converting it in chunks of at most bsize
  // elements so that only one chunk's string is held at a time.
  template <class T>
  void writeSeqToStream(ofstream& os, parlay::sequence<T> const &A) {
    const size_t bsize = 10000000;
    const size_t n = A.size();
    for (size_t offset = 0; offset < n; offset += bsize) {
      // Generate the string for this chunk and write it to the output
      // stream, leaving out the trailing 0 byte.
      charstring S = seqToString(A.cut(offset, min(offset + bsize, n)));
      os.write(S.begin(), S.size()-1);
    }
  }

  // Writes `header` on its own line followed by the elements of A to
  // fileName. Returns 0 on success, 1 if the file cannot be opened.
  template <class T>
  int writeSeqToFile(string header,
		     parlay::sequence<T> const &A,
		     char const *fileName) {
    // (An unused `auto a = A[0];` used to live here; it read out of bounds
    // whenever A was empty.)
    ofstream file (fileName, ios::out | ios::binary);
    if (!file.is_open()) {
      std::cout << "Unable to open file: " << fileName << std::endl;
      return 1;
    }
    file << header << endl;
    writeSeqToStream(file, A);
    file.close();
    return 0;
  }

  // Writes `header` on its own line, then the elements of A, then the
  // elements of B, to fileName. Returns 0 on success, 1 if the file cannot
  // be opened.
  template <class T1, class T2>
  int write2SeqToFile(string header,
		      parlay::sequence<T1> const &A,
		      parlay::sequence<T2> const &B,
		      char const *fileName) {
    ofstream file (fileName, ios::out | ios::binary);
    if (file.is_open()) {
      file << header << endl;
      writeSeqToStream(file, A);
      writeSeqToStream(file, B);
      file.close();
      return 0;
    }
    std::cout << "Unable to open file: " << fileName << std::endl;
    return 1;
  }

  // Reads the whole file into a charstring. Aborts if the file cannot be
  // opened.
  charstring readStringFromFile(char const *fileName) {
    // ios::ate positions the stream at the end, so tellg gives the end offset
    ifstream file (fileName, ios::in | ios::binary | ios::ate);
    if (!file.is_open()) {
      std::cout << "Unable to open file: " << fileName << std::endl;
      abort();
    }
    const long last = file.tellg();
    file.seekg (0, ios::beg);
    const long first = file.tellg();
    const long n = last - first;

    charstring bytes(n, (char) 0);
    file.read (bytes.begin(), n);
    file.close();
    return bytes;
  }

  // Header token: written as the first line by writeIntSeqToFile and
  // required as the first token by readIntSeqFromFile.
  string intHeaderIO = "sequenceInt";

  // Writes A to fileName under the integer-sequence header.
  // Returns the status reported by writeSeqToFile.
  template <class T>
  int writeIntSeqToFile(parlay::sequence<T> const &A, char const *fileName) {
    const int status = writeSeqToFile(intHeaderIO, A, fileName);
    return status;
  }

  // Maps fileName and splits its contents into tokens separated by
  // characters for which benchIO::is_space holds.
  sequence<sequence<char>> get_tokens(char const *fileName) {
    auto contents = parlay::file_map(fileName);
    return parlay::tokens(contents, benchIO::is_space);
  }

  // Reads a file written by writeIntSeqToFile: the first token must equal
  // intHeaderIO, every following token is parsed as an integer and converted
  // to T. Prints a message and aborts on a missing or wrong header.
  template <class T>
  parlay::sequence<T> readIntSeqFromFile(char const *fileName) {
    auto W = get_tokens(fileName);
    // A file without any token has no header either; checking the size first
    // avoids indexing W[0] on an empty sequence.
    if (W.size() == 0 || string(W[0].begin(),W[0].end()) != intHeaderIO) {
      cout << "readIntSeqFromFile: bad input" << endl;
      abort();
    }
    long n = W.size()-1;
    // token 0 is the header, so value i lives in token i+1
    auto A = parlay::tabulate(n, [&] (long i) -> T {
	return parlay::chars_to_long(W[i+1]);});
    return A;
  }
};


================================================
FILE: algorithms/bench/MakeBench
================================================
# ********************
# GENERIC MAKEFILE FOR MOST BENCHMARKS THAT #include <name>.h
# USES FOLLOWING DEFINITIONS
#    BENCH : the name of the benchmark
#    REQUIRE : dependences
#    CC : the compiler
#    CFLAGS : compiler flags
#    LFLAGS : compiler link flags
# ********************

# Timing driver source that is compiled for the benchmark
TIME = ../bench/$(BENCH)Time.C
# Include path for the parlaylib headers
INCLUDE = -I ../../parlaylib/include/

# Default target: build the benchmark executable
all : $(BENCH) 

# Compile the timing driver with $(BENCH).h force-included and STATS defined
$(BENCH) : $(TIME) $(BENCH).h $(REQUIRE)
	$(CC) -DSTATS $(CFLAGS) $(INCLUDE) -include $(BENCH).h -o $(BENCH) $(TIME) $(LFLAGS)

# Remove the benchmark executable
clean :
	rm -f $(BENCH)

# Additionally remove local testInputs* files and run clean in ../bench
cleanall : clean
	rm -f testInputs*; cd ../bench; make -s clean


================================================
FILE: algorithms/bench/Makefile
================================================
# CC, CFLAGS and LFLAGS are not defined in this file; presumably they come
# from parallelDefsANN - confirm.
include parallelDefsANN
BNCHMRK = neighbors

# Object file for the checker (only referenced by the disabled rule below)
CHECKFILES = $(BNCHMRK)Check.o

# Extra prerequisites for every object file (currently none)
COMMON =

INCLUDE = -Icommon

# Compile a .C source into an object file
%.o : %.C $(COMMON)
	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@

# $(BNCHMRK)Check : $(CHECKFILES)
# 	$(CC) $(LFLAGS) -o $@ $(CHECKFILES)

# Remove the checker binary, object files and Python bytecode
clean :
	rm -f $(BNCHMRK)Check *.o *.pyc


================================================
FILE: algorithms/bench/benchUtils.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#ifndef __BENCHUTILS_H__
#define __BENCHUTILS_H__

#include <iostream>
#include <algorithm>
#include <iterator>
#include <type_traits>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "common/geometry.h"
#include "common/geometryIO.h"
#include "common/parse_command_line.h"
// #include "../utils/types.h"
// #include "common/time_loop.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

using namespace benchIO;

// Tag describing where the memory behind a ptr_mapped comes from.
// The only tag treated specially in this file is NATIVE: ptr_mapped<T,NATIVE>
// is a plain T*. NOTE(review): the intended meaning of the remaining tags is
// not visible here - confirm against the code that consumes them.
enum class ptr_mapped_src{
  NATIVE, VOLATILE, PERSISTENT, TRANSITIVE
};

namespace detail{

// Thin wrapper around a raw T* that carries its ptr_mapped_src tag in the
// type. It converts implicitly to and from T*, converts between tags and
// between compatible pointee types (U* convertible to T*), and offers the
// pointer arithmetic / comparison operators listed below.
template<typename T, ptr_mapped_src Src>
class ptr_mapped_impl
{
  T *ptr_raw;
public:
  using difference_type = std::ptrdiff_t;
  using value_type = std::remove_cv_t<T>;
  using pointer = T*;
  using reference = T&;
  using iterator_category = std::random_access_iterator_tag;

  // Default construction yields a null pointer (previously the wrapped
  // pointer was left indeterminate).
  ptr_mapped_impl() : ptr_raw(nullptr){
  }

  ptr_mapped_impl(T *p) : ptr_raw(p){
  }

  // Converting constructor from any other tag / compatible pointee type.
  template<typename U, ptr_mapped_src SrcOther>
  ptr_mapped_impl(const ptr_mapped_impl<U,SrcOther> &ptr) :
    ptr_raw(ptr.get()){
    static_assert(std::is_convertible_v<U*,T*>);
  }

  ptr_mapped_impl& operator=(T *p){
    ptr_raw = p;
    return *this;
  }

  // Converting assignment from any other tag / compatible pointee type.
  template<typename U, ptr_mapped_src SrcOther>
  ptr_mapped_impl& operator=(const ptr_mapped_impl<U,SrcOther> &ptr){
    static_assert(std::is_convertible_v<U*,T*>);
    ptr_raw = ptr.get();
    // Fix: the original fell off the end of this non-void function, which is
    // undefined behaviour.
    return *this;
  }

  // Access to the wrapped raw pointer.
  T* get() const{
    return ptr_raw;
  }

  operator T*() const{
    return get();
  }

  // For simplicity, only a minimal set of iterator operations is provided.

  T& operator*() const{
    return *get();
  }

  ptr_mapped_impl& operator++(){
    ++ptr_raw;
    return *this;
  }

  ptr_mapped_impl& operator+=(size_t n){
    ptr_raw += n;
    return *this;
  }

  ptr_mapped_impl operator+(size_t n) const{
    return ptr_raw+n;
  }

  ptr_mapped_impl& operator-=(size_t n){
    ptr_raw -= n;
    return *this;
  }

  ptr_mapped_impl operator-(size_t n) const{
    return ptr_raw - n;
  }

  // Distance in elements between two wrapped pointers.
  difference_type operator-(const ptr_mapped_impl &other) const{
    return ptr_raw - other.ptr_raw;
  }

  reference operator[](size_t i) const{
    return ptr_raw[i];
  }

  // Ordering is that of the wrapped pointers; the other three relations are
  // derived from operator<.
  bool operator<(const ptr_mapped_impl &other) const{
    return ptr_raw < other.ptr_raw;
  }

  bool operator>(const ptr_mapped_impl &other) const{
    return other<*this;
  }

  bool operator>=(const ptr_mapped_impl &other) const{
    return !(*this<other);
  }

  bool operator<=(const ptr_mapped_impl &other) const{
    return !(*this>other);
  }
};

} // namespace detail

// ptr_mapped<T,Src>: a plain T* for NATIVE, the tagged wrapper otherwise.
template<typename T, ptr_mapped_src Src>
using ptr_mapped = std::conditional_t<Src==ptr_mapped_src::NATIVE, T*, detail::ptr_mapped_impl<T,Src>>;
/*
template<typename T, ptr_mapped_src Src>
struct std::iterator_traits<detail::ptr_mapped_impl<T,Src>>
{
  using difference_type = std::ptrdiff_t;
  using value_type = std::remove_cv_t<T>;
  using pointer = T*;
  using reference = T&;
  using iterator_category = void;
};
*/
// *************************************************************
//  SOME DEFINITIONS
// *************************************************************


// *************************************************************
// Parsing code (should move to common?)
// *************************************************************

// Maps `filename` read-only (private mapping) and returns the start of the
// mapping together with the file size in bytes. The descriptor is closed
// before returning; the mapping itself is not released here.
// Any failure (open, fstat, not a regular file, mmap, close) prints a
// diagnostic to stderr and terminates the process via exit(-1).
std::pair<char*, size_t> mmapStringFromFile(const char* filename) {
  struct stat sb;
  int fd = open(filename, O_RDONLY);
  if (fd == -1) {
    perror("open");
    exit(-1);
  }
  if (fstat(fd, &sb) == -1) {
    perror("fstat");
    exit(-1);
  }
  if (!S_ISREG(sb.st_mode)) {
    // This check does not set errno, so perror would append an unrelated
    // (often "Success") message; report the problem directly instead.
    fprintf(stderr, "%s: not a regular file\n", filename);
    exit(-1);
  }
  char* p =
      static_cast<char*>(mmap(0, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0));
  if (p == MAP_FAILED) {
    perror("mmap");
    exit(-1);
  }
  if (close(fd) == -1) {
    perror("close");
    exit(-1);
  }
  size_t n = sb.st_size;
  return std::make_pair(p, n);
}

/*
auto parse_fvecs(const char* filename)
{
  return parse_vecs<float>(filename, [](size_t id, auto begin, auto end){
    typedef typename std::iterator_traits<decltype(begin)>::value_type type_elem;
    static_assert(std::is_same_v<decltype(begin),ptr_mapped<type_elem,ptr_mapped_src::DISK>>);

    Tvec_point<type_elem> point;
    point.id = id;
    point.coordinates = parlay::make_slice(begin.get(), end.get());
    return point;
  }).first;
}

auto parse_ivecs(const char* filename)
{
  return parse_vecs<float>(filename, [](size_t id, auto begin, auto end){
    typedef typename std::iterator_traits<decltype(begin)>::value_type type_elem;
    static_assert(std::is_same_v<decltype(begin),ptr_mapped<type_elem,ptr_mapped_src::DISK>>);

    ivec_point point;
    point.id = id;
    point.coordinates = parlay::make_slice(begin.get(), end.get());
    return point;
  }).first;
}
*/

// auto parse_bvecs_to_fvecs(const char* filename) {

//   using slice_f = typename parlay::slice<float*, float*>;
//   auto [fileptr, length] = mmapStringFromFile(filename);
//   // std::cout << "Successfully mmap'd" << std::endl;

//   // Each vector is 4 + d bytes.
//   // * first 4 bytes encode the dimension (as an integer)
//   // * next d values are unsigned chars representing vector components
//   // See http://corpus-texmex.irisa.fr/ for more details.

//   int d = *((int*)fileptr);
//   std::cout << "Dimension = " << d << std::endl;

//   size_t vector_size = 4 + d;
//   size_t num_vectors = length / vector_size;
//   std::cout << "Num vectors = " << num_vectors << std::endl;

//   parlay::sequence<fvec_point> points(num_vectors);

//   // parlay::parallel_for(0, num_vectors, [&] (size_t i) {
//   //   size_t offset_in_bytes = vector_size * i + 4;  // skip dimension
//   //   float* start = (float*)(fileptr + offset_in_bytes);
//   //   float* end = start + d;
//   //   points[i].id = i; 
//   //   points[i].coordinates = parlay::make_slice(start, end);
//   // });

//   parlay::parallel_for(0, num_vectors, [&] (size_t i) {
//     size_t offset_in_bytes = vector_size * i + 4;  // skip dimension
//     points[i].id = i; 
//     unsigned char* start = (unsigned char*)(fileptr + offset_in_bytes);
//     parlay::sequence<float> coords = *new parlay::sequence<float>(d);
//     for(int j=0; j<d; j++){
//       float elt = *new float;
//       elt = static_cast<float>(*(start+j));
//       coords[j] = elt;
//       // std::cout << coords[j] << std::endl; 
//     }
//     slice_f slicecoords = *new slice_f(coords.begin(), coords.end());
//     // slicecoords =  parlay::make_slice(coords.begin(), coords.end());
//     points[i].coordinates = slicecoords;
//   });


//   return points;
// }

#endif // __BENCHUTILS_H__


================================================
FILE: algorithms/bench/common/IO.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <iostream>
#include <fstream>
#include <string>
#include <string>
#include <cstring>
#include "../parlay/primitives.h"
#include "../parlay/parallel.h"
#include "../parlay/io.h"
#include "../parlay/internal/get_time.h"

namespace benchIO {
  using namespace std;
  using parlay::sequence;
  using parlay::tabulate;
  using parlay::make_slice;

  // Predicate: true for carriage return, tab, newline, NUL and space.
  auto is_space = [] (char c) {
    return c == '\r' || c == '\t' || c == '\n' || c == 0 || c == ' ';
  };

  // Splits Str into words, in parallel.
  // Side effect: every character for which is_space holds is overwritten
  // with 0, so each word ends up null-terminated inside Str. The result holds
  // one pointer per word, pointing at its first character within Str.
  template <class Seq>
  parlay::sequence<char*> stringToWords(Seq &Str) {
    const size_t len = Str.size();

    // null out the separators
    parlay::parallel_for(0, len, [&] (long pos) {
      if (is_space(Str[pos])) Str[pos] = 0;
    });

    // a word starts at a non-null character that is first or follows a null
    auto starts = parlay::tabulate(len, [&] (long pos) -> bool {
      if (pos == 0) return Str[0];
      return Str[pos] && !Str[pos-1];
    });

    // index of each word start
    auto word_offsets = parlay::pack_index<long>(starts);

    // turn the indices into pointers into Str
    return parlay::tabulate(word_offsets.size(), [&] (long k) -> char* {
      return Str.begin() + word_offsets[k];
    });
  }

  //using this as a typename so we can replace with parlay::chars easily if desired
  using charstring = typename parlay::sequence<char>;

  inline int xToStringLen(charstring const &a) { return a.size();}
  inline void xToString(char* s, charstring const &a) {
    for (int i=0; i < a.size(); i++) s[i] = a[i];}

  // Integer overloads. xToStringLen returns a fixed buffer size that covers
  // the longest decimal rendering plus the NUL that sprintf appends;
  // xToString writes the decimal text into s.

  // long: "-9223372036854775808" is 20 characters, + NUL
  inline int xToStringLen(long a) {
    return 21;
  }
  inline void xToString(char* s, long a) {
    sprintf(s, "%ld", a);
  }

  // unsigned long: "18446744073709551615" is 20 characters, + NUL
  inline int xToStringLen(unsigned long a) {
    return 21;
  }
  inline void xToString(char* s, unsigned long a) {
    sprintf(s, "%lu", a);
  }

  // unsigned int: "4294967295" is 10 characters, + NUL
  inline uint xToStringLen(uint a) {
    return 12;
  }
  inline void xToString(char* s, uint a) {
    sprintf(s, "%u", a);
  }

  // int: "-2147483648" is 11 characters, + NUL
  inline int xToStringLen(int a) {
    return 12;
  }
  inline void xToString(char* s, int a) {
    sprintf(s, "%d", a);
  }

  // double, printed as "%.11le". The longest rendering is
  //   sign(1) + digit(1) + '.'(1) + 11 digits + 'e'(1) + sign(1) + 3 exponent digits
  // = 19 characters, so 20 bytes are needed including the NUL sprintf writes.
  // (The previous value of 18 was too small for e.g. -1.79769313486e+308.)
  inline int xToStringLen(double a) { return 20;}
  inline void xToString(char* s, double a) { sprintf(s,"%.11le", a);}

  // C string: its length plus one byte for the terminating NUL.
  inline int xToStringLen(char* a) {
    return strlen(a)+1;
  }
  // Copies the C string a (including its NUL) into s.
  inline void xToString(char* s, char* a) {
    sprintf(s, "%s", a);
  }

  // pair: room for both components plus the single separating space.
  template <class A, class B>
  inline int xToStringLen(pair<A,B> a) {
    const int first_len = xToStringLen(a.first);
    const int second_len = xToStringLen(a.second);
    return first_len + second_len + 1;
  }

  // pair: writes the first component at s, a space at the end of the first
  // component's reserved length, and the second component right after it.
  template <class A, class B>
  inline void xToString(char* s, pair<A,B> a) {
    const int first_len = xToStringLen(a.first);
    char* second_pos = s + first_len + 1;
    xToString(s, a.first);
    s[first_len] = ' ';
    xToString(second_pos, a.second);
  }

  // Renders the elements of A as text, one element per line.
  // The returned charstring ends with a 0 byte after the final newline;
  // callers write size()-1 bytes to skip it.
  // An empty A yields a charstring holding only that terminating 0.
  template <class Seq>
  charstring seqToString(Seq const &A) {
    size_t n = A.size();
    // Guard: with n == 0 the code below would evaluate n-1 (wraps around)
    // and index L[n-1] / Bs[m-1] out of bounds.
    if (n == 0) return charstring(1, (char) 0);

    // reserved length of each element, plus one byte for its newline
    auto L = parlay::tabulate(n, [&] (size_t i) -> long {
	typename Seq::value_type x = A[i];
	return xToStringLen(x)+1;});
    size_t m;
    // exclusive prefix sums: L[i] becomes the start offset of element i,
    // m the total number of reserved bytes
    std::tie(L,m) = parlay::scan(std::move(L));

    // zero-filled scratch buffer; unused bytes stay 0 and are filtered out below
    charstring B(m+1, (char) 0);
    char* Bs = B.begin();

    parlay::parallel_for(0, n-1, [&] (long i) {
      xToString(Bs + L[i], A[i]);
      Bs[L[i+1] - 1] = '\n';
      });
    // the last element has no successor offset, so it is handled separately
    xToString(Bs + L[n-1], A[n-1]);
    Bs[m] = Bs[m-1] = '\n';
    
    // drop the padding zeros, then turn the extra trailing newline into the terminator
    charstring C = parlay::filter(B, [&] (char c) {return c != 0;}); 
    C[C.size()-1] = 0;
    return C;
  }

  // Writes A to os as text, converting it in chunks of at most bsize
  // elements so that only one chunk's string is held at a time.
  template <class T>
  void writeSeqToStream(ofstream& os, parlay::sequence<T> const &A) {
    const size_t bsize = 10000000;
    const size_t n = A.size();
    for (size_t offset = 0; offset < n; offset += bsize) {
      // Generate the string for this chunk and write it to the output
      // stream, leaving out the trailing 0 byte.
      charstring S = seqToString(A.cut(offset, min(offset + bsize, n)));
      os.write(S.begin(), S.size()-1);
    }
  }

  // Writes `header` on its own line followed by the elements of A to
  // fileName. Returns 0 on success, 1 if the file cannot be opened.
  template <class T>
  int writeSeqToFile(string header,
		     parlay::sequence<T> const &A,
		     char const *fileName) {
    // (An unused `auto a = A[0];` used to live here; it read out of bounds
    // whenever A was empty.)
    ofstream file (fileName, ios::out | ios::binary);
    if (!file.is_open()) {
      std::cout << "Unable to open file: " << fileName << std::endl;
      return 1;
    }
    file << header << endl;
    writeSeqToStream(file, A);
    file.close();
    return 0;
  }

  // Writes `header` on its own line, then the elements of A, then the
  // elements of B, to fileName. Returns 0 on success, 1 if the file cannot
  // be opened.
  template <class T1, class T2>
  int write2SeqToFile(string header,
		      parlay::sequence<T1> const &A,
		      parlay::sequence<T2> const &B,
		      char const *fileName) {
    ofstream file (fileName, ios::out | ios::binary);
    if (file.is_open()) {
      file << header << endl;
      writeSeqToStream(file, A);
      writeSeqToStream(file, B);
      file.close();
      return 0;
    }
    std::cout << "Unable to open file: " << fileName << std::endl;
    return 1;
  }

  // Reads the whole file into a charstring. Aborts if the file cannot be
  // opened.
  charstring readStringFromFile(char const *fileName) {
    // ios::ate positions the stream at the end, so tellg gives the end offset
    ifstream file (fileName, ios::in | ios::binary | ios::ate);
    if (!file.is_open()) {
      std::cout << "Unable to open file: " << fileName << std::endl;
      abort();
    }
    const long last = file.tellg();
    file.seekg (0, ios::beg);
    const long first = file.tellg();
    const long n = last - first;

    charstring bytes(n, (char) 0);
    file.read (bytes.begin(), n);
    file.close();
    return bytes;
  }

  // Header token: written as the first line by writeIntSeqToFile and
  // required as the first token by readIntSeqFromFile.
  string intHeaderIO = "sequenceInt";

  // Writes A to fileName under the integer-sequence header.
  // Returns the status reported by writeSeqToFile.
  template <class T>
  int writeIntSeqToFile(parlay::sequence<T> const &A, char const *fileName) {
    const int status = writeSeqToFile(intHeaderIO, A, fileName);
    return status;
  }

  // Maps fileName and splits its contents into tokens separated by
  // characters for which benchIO::is_space holds.
  sequence<sequence<char>> get_tokens(char const *fileName) {
    auto contents = parlay::file_map(fileName);
    return parlay::tokens(contents, benchIO::is_space);
  }

  // Reads a file written by writeIntSeqToFile: the first token must equal
  // intHeaderIO, every following token is parsed as an integer and converted
  // to T. Prints a message and aborts on a missing or wrong header.
  template <class T>
  parlay::sequence<T> readIntSeqFromFile(char const *fileName) {
    auto W = get_tokens(fileName);
    // A file without any token has no header either; checking the size first
    // avoids indexing W[0] on an empty sequence.
    if (W.size() == 0 || string(W[0].begin(),W[0].end()) != intHeaderIO) {
      cout << "readIntSeqFromFile: bad input" << endl;
      abort();
    }
    long n = W.size()-1;
    // token 0 is the header, so value i lives in token i+1
    auto A = parlay::tabulate(n, [&] (long i) -> T {
	return parlay::chars_to_long(W[i+1]);});
    return A;
  }
};


================================================
FILE: algorithms/bench/common/MakeBench
================================================
# ********************
# GENERIC MAKEFILE FOR MOST BENCHMARKS THAT #include <name>.h
# USES FOLLOWING DEFINITIONS
#    BENCH : the name of the benchmark
#    REQUIRE : dependences
#    CC : the compiler
#    CFLAGS : compiler flags
#    LFLAGS : compiler link flags
# ********************

# Timing driver source that is compiled for the benchmark
TIME = ../bench/$(BENCH)Time.C
# Name of the checker target built in ../bench
CHECK = $(BENCH)Check
INCLUDE = 

# Default target: build the benchmark, copy the test inputs, then build the
# checker in ../bench
all : $(BENCH) testInputs
	cd ../bench; make -s $(CHECK)

# Compile the timing driver with $(BENCH).h force-included
$(BENCH) : $(TIME) $(BENCH).h $(REQUIRE)
	$(CC) $(CFLAGS) $(INCLUDE) -include $(BENCH).h -o $(BENCH) $(TIME) $(LFLAGS)

# Copy the generic test input lists into the current directory
testInputs : ../bench/testInputs ../bench/testInputs_small
	cp ../bench/testInputs ../bench/testInputs_small .

# Remove the benchmark executable
clean :
	rm -f $(BENCH)

# Additionally remove local testInputs* files and run clean in ../bench
cleanall : clean
	rm -f testInputs*; cd ../bench; make -s clean


================================================
FILE: algorithms/bench/common/MakeBenchLink
================================================
# ********************
# GENERIC MAKEFILE FOR MOST BENCHMARKS THAT LINK
# THE TIMING CODE WITH THE IMPLEMENTATION
# USES FOLLOWING DEFINITIONS
#    BENCH : the name of the benchmark
#    OBJS : implementation object files
#    REQUIRE : dependences for the object files
#    CC : the compiler
#    CFLAGS : compiler flags
#    LFLAGS : compiler link flags
# ********************

# Base name of the timing code (object file here, source in ../bench)
TIME = $(BENCH)Time
# Checker program, built in ../bench
CHECK = ../bench/$(BENCH)Check
INCLUDE =

# Make benchmark: link the timing object with the implementation objects
$(BENCH) : $(TIME).o $(OBJS) $(CHECK) testInputs
	$(CC) -o $@ $(TIME).o $(OBJS) $(LFLAGS)

# Timing Code
$(TIME).o : ../bench/$(TIME).C 
	$(CC) $(CFLAGS) $(INCLUDE) -o $@ -c ../bench/$(TIME).C
	
# The check code (built by the Makefile in ../bench)
$(CHECK) : $(CHECK).C
	cd ../bench; make -s $(BENCH)Check

# object files
%.o : %.C $(REQUIRE)
	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@

# copy over the generic test code
testInputs : ../bench/testInputs ../bench/testInputs_small
	cp ../bench/testInputs ../bench/testInputs_small .

# Remove the benchmark executable and object files
clean :
	rm -f $(BENCH) *.o

# Additionally remove local testInputs* files and run clean in ../bench
cleanall : clean
	rm -f testInputs*; cd ../bench; make -s clean


================================================
FILE: algorithms/bench/common/atomics.h
================================================
#pragma once

namespace pbbs {

  // Atomically replaces *a with newval if *a is bitwise equal to oldval.
  // Returns true when the swap happened.
  // The value is compared as an unsigned integer of the same width, so ET
  // must be 1, 2, 4 or 8 bytes wide. Other sizes up to 8 still fall into the
  // 8-byte branch, as before, and are not handled correctly.
  template <typename ET>
  inline bool atomic_compare_and_swap(ET* a, ET oldval, ET newval) {
    static_assert(sizeof(ET) <= 8, "Bad CAS length");
    if (sizeof(ET) == 1) {
      uint8_t r_oval, r_nval;
      std::memcpy(&r_oval, &oldval, sizeof(ET));
      std::memcpy(&r_nval, &newval, sizeof(ET));
      return __sync_bool_compare_and_swap(reinterpret_cast<uint8_t*>(a), r_oval, r_nval);
    } else if (sizeof(ET) == 2) {
      // Fix: 2-byte types used to fall through to the 8-byte branch, which
      // compared uninitialised bytes and could overwrite neighbouring memory.
      uint16_t r_oval, r_nval;
      std::memcpy(&r_oval, &oldval, sizeof(ET));
      std::memcpy(&r_nval, &newval, sizeof(ET));
      return __sync_bool_compare_and_swap(reinterpret_cast<uint16_t*>(a), r_oval, r_nval);
    } else if (sizeof(ET) == 4) {
      uint32_t r_oval, r_nval;
      std::memcpy(&r_oval, &oldval, sizeof(ET));
      std::memcpy(&r_nval, &newval, sizeof(ET));
      return __sync_bool_compare_and_swap(reinterpret_cast<uint32_t*>(a), r_oval, r_nval);
    } else { // if (sizeof(ET) == 8) {
      uint64_t r_oval, r_nval;
      std::memcpy(&r_oval, &oldval, sizeof(ET));
      std::memcpy(&r_nval, &newval, sizeof(ET));
      return __sync_bool_compare_and_swap(reinterpret_cast<uint64_t*>(a), r_oval, r_nval);
    } 
  }

  // Adds b to *a with a compare-and-swap retry loop and returns the value
  // *a held immediately before the successful update.
  template <typename E, typename EV>
  inline E fetch_and_add(E *a, EV b) {
    volatile E before, after;
    for (;;) {
      before = *a;
      after = before + b;
      if (atomic_compare_and_swap(a, before, after)) break;
    }
    return before;
  }

  // Adds b to *a with a compare-and-swap retry loop; nothing is returned.
  template <typename E, typename EV>
  inline void write_add(E *a, EV b) {
    E before, after;
    for (;;) {
      before = *a;
      after = before + b;
      if (atomic_compare_and_swap(a, before, after)) return;
    }
  }

  // std::atomic variant: adds b to *a, retrying the compare-exchange until
  // it succeeds.
  template <typename E, typename EV>
  inline void write_add(std::atomic<E> *a, EV b) {
    E expected, desired;
    for (;;) {
      expected = a->load();
      desired = expected + b;
      if (std::atomic_compare_exchange_strong(a, &expected, desired)) return;
    }
  }

  // Stores b into *a if less(b, *a) holds, retrying while the
  // compare-and-swap loses a race. Returns true iff this call wrote b.
  template <typename ET, typename F>
  inline bool write_min(ET *a, ET b, F less) {
    ET seen;
    bool swapped = 0;
    for (;;) {
      seen = *a;
      if (!less(b, seen)) break;
      swapped = atomic_compare_and_swap(a, seen, b);
      if (swapped) break;
    }
    return swapped;
  }

  // std::atomic variant: stores b into *a if less(b, *a) holds, retrying
  // while the compare-exchange fails. Returns true iff this call wrote b.
  template <typename ET, typename F>
  inline bool write_min(std::atomic<ET> *a, ET b, F less) {
    ET seen;
    bool swapped = 0;
    for (;;) {
      seen = a->load();
      if (!less(b, seen)) break;
      swapped = std::atomic_compare_exchange_strong(a, &seen, b);
      if (swapped) break;
    }
    return swapped;
  }

  // Stores b into *a if less(*a, b) holds, retrying while the
  // compare-and-swap loses a race. Returns true iff this call wrote b.
  template <typename ET, typename F>
  inline bool write_max(ET *a, ET b, F less) {
    ET seen;
    bool swapped = 0;
    for (;;) {
      seen = *a;
      if (!less(seen, b)) break;
      swapped = atomic_compare_and_swap(a, seen, b);
      if (swapped) break;
    }
    return swapped;
  }

  // std::atomic variant: stores b into *a if less(*a, b) holds, retrying
  // while the compare-exchange fails. Returns true iff this call wrote b.
  template <typename ET, typename F>
  inline bool write_max(std::atomic<ET> *a, ET b, F less) {
    ET seen;
    bool swapped = 0;
    for (;;) {
      seen = a->load();
      if (!less(seen, b)) break;
      swapped = std::atomic_compare_exchange_strong(a, &seen, b);
      if (swapped) break;
    }
    return swapped;
  }
}


================================================
FILE: algorithms/bench/common/dataGen.h
================================================
#pragma once
#include "../parlay/utilities.h"

namespace dataGen {

#define HASH_MAX_INT ((unsigned) 1 << 31)

  //#define HASH_MAX_LONG ((unsigned long) 1 << 63)

  // Maps an index i to a value of type T derived from parlay::hash64(i).
  // Only the explicit specializations below are defined.
  template <class T> T hash(size_t i);

  // int: the low 31 bits of the hash.
  template <>
  inline int hash<int>(size_t i) {
    const size_t low31 = (static_cast<size_t>(1) << 31) - 1;
    return parlay::hash64(i) & low31;
  }

  // long: the low 63 bits of the hash.
  template <>
  inline long hash<long>(size_t i) {
    const size_t low63 = (static_cast<size_t>(1) << 63) - 1;
    return parlay::hash64(i) & low63;
  }

  // unsigned int: the hash converted to unsigned int.
  template <>
  inline unsigned int hash<unsigned int>(size_t i) {
    return parlay::hash64(i);
  }

  // size_t: the hash converted to size_t.
  template <>
  inline size_t hash<size_t>(size_t i) {
    return parlay::hash64(i);
  }

  // double: hash<int>(i) divided by 2^31 - 1.
  template <>
  inline double hash<double>(size_t i) {
    const double numerator = static_cast<double>(hash<int>(i));
    const double denominator =
      static_cast<double>((static_cast<size_t>(1) << 31) - 1);
    return numerator / denominator;
  }

  // float: same quotient as the double version, narrowed to float on return.
  template <>
  inline float hash<float>(size_t i) {
    const double numerator = static_cast<double>(hash<int>(i));
    const double denominator =
      static_cast<double>((static_cast<size_t>(1) << 31) - 1);
    return numerator / denominator;
  }
};


================================================
FILE: algorithms/bench/common/geometry.h
================================================
#pragma once
#include <iostream>
#include <algorithm>
#include <math.h>
#include <iomanip>
#include "../parlay/parallel.h"
#include "../parlay/primitives.h"
using namespace std;

// *************************************************************
//    POINTS AND VECTORS (3d),  2d is below
// *************************************************************


  template <class Coord>
  class point3d;

  // Three-component vector (x, y, z) over the coordinate type Coord.
  template <class Coord>
  class vector3d {
  public:
    using coord = Coord;
    using vector = vector3d;
    using point = point3d<coord>;

    static const int dim = 3;

    coord x;
    coord y;
    coord z;

    // Constructors: zero vector, explicit components, from a point (defined
    // after point3d), or from the first three entries of a slice.
    vector3d() : x(0), y(0), z(0) {}
    vector3d(coord x0, coord y0, coord z0) : x(x0), y(y0), z(z0) {}
    vector3d(point p);
    vector3d(parlay::slice<coord*,coord*> p) : x(p[0]), y(p[1]), z(p[2]) {}

    // Component-wise arithmetic.
    vector operator+(vector op2) {
      return vector(x + op2.x, y + op2.y, z + op2.z);
    }
    vector operator-(vector op2) {
      return vector(x - op2.x, y - op2.y, z - op2.z);
    }
    point operator+(point op2);
    vector operator*(coord s) { return vector(x * s, y * s, z * s); }
    vector operator/(coord s) { return vector(x / s, y / s, z / s); }

    // Mutable access by index: 0 -> x, 1 -> y, anything else -> z.
    coord& operator[] (int i) {
      if (i == 0) return x;
      if (i == 1) return y;
      return z;
    }

    // Dot and cross products with v.
    coord dot(vector v) { return x * v.x + y * v.y + z * v.z; }
    vector cross(vector v) {
      return vector(y*v.z - z*v.y,
                    z*v.x - x*v.z,
                    x*v.y - y*v.x);
    }

    // Largest of the three components.
    coord maxDim() {
      coord yz = max(y, z);
      return max(x, yz);
    }

    // Writes ":(x,y,z):" to cout after setting its precision to 10.
    void print() {
      cout << std::setprecision(10);
      cout << ":(" << x << "," << y << "," << z << "):";
    }

    // Euclidean length and its square.
    coord Length(void) { return sqrt(x*x+y*y+z*z); }
    coord sqLength(void) { return x*x+y*y+z*z; }
  };

  // Three-dimensional point (x, y, z) over the coordinate type Coord.
  template <class Coord>
  class point3d {
  public:
    using coord = Coord;
    using vector = vector3d<coord>;
    using point = point3d;

    static const int dim = 3;

    coord x;
    coord y;
    coord z;

    int dimension() { return 3; }

    // Constructors: origin, explicit coordinates, from a vector, or from the
    // first three entries of a slice.
    point3d() : x(0), y(0), z(0) {}
    point3d(coord x0, coord y0, coord z0) : x(x0), y(y0), z(z0) {}
    point3d(vector v) : x(v.x), y(v.y), z(v.z) {}
    point3d(parlay::slice<coord*,coord*> p) : x(p[0]), y(p[1]), z(p[2]) {}

    // Writes ":(x,y,z):" to cout.
    void print() { cout << ":(" << x << "," << y << "," << z << "):"; }

    // point - point gives the displacement vector; point + vector a point.
    vector operator-(point op2) {
      return vector(x - op2.x, y - op2.y, z - op2.z);
    }
    point operator+(vector op2) {
      return point(x + op2.x, y + op2.y, z + op2.z);
    }

    // Component-wise minimum / maximum with b.
    point minCoords(point b) {
      return point(min(x, b.x), min(y, b.y), min(z, b.z));
    }
    point maxCoords(point b) {
      return point(max(x, b.x), max(y, b.y), max(z, b.z));
    }

    // Mutable access by index: 0 -> x, 1 -> y, anything else -> z.
    coord& operator[] (int i) {
      if (i == 0) return x;
      if (i == 1) return y;
      return z;
    }

    // Octant index in [0..7] relative to center: bit 0 is set when
    // x > center.x, bit 1 when y > center.y, bit 2 when z > center.z.
    int quadrant(point center) {
      return (x > center.x ? 1 : 0)
           + (y > center.y ? 2 : 0)
           + (z > center.z ? 4 : 0);
    }

    // Returns this point moved by +/-offset along every axis; bit k of dir
    // (an integer from [0..7]) selects +offset (set) or -offset (clear).
    point offsetPoint(int dir, coord offset) {
      coord dx = (dir & 1) ? offset : -offset;
      coord dy = (dir & 2) ? offset : -offset;
      coord dz = (dir & 4) ? offset : -offset;
      return point(x + dx, y + dy, z + dz);
    }

    // Builds a point from the first three entries of v (this point's own
    // coordinates are not used).
    point changeCoords(std::vector<coord> v){
      return point(v[0], v[1], v[2]);
    }

    // True when pt lies outside the axis-aligned box centered at this point
    // with half-width hsize.
    bool outOfBox(point pt, coord hsize) {
      return (x - hsize > pt.x) || (x + hsize < pt.x)
          || (y - hsize > pt.y) || (y + hsize < pt.y)
          || (z - hsize > pt.z) || (z + hsize < pt.z);
    }
  };

  // vector + point: component-wise sum, returned as a point.
  template <class coord>
  inline point3d<coord> vector3d<coord>::operator+(point3d<coord> op2) {
    point3d<coord> sum(x + op2.x, y + op2.y, z + op2.z);
    return sum;
  }

  // Builds the vector whose components equal the coordinates of p.
  template <class coord>
  inline vector3d<coord>::vector3d(point3d<coord> p)
    : x(p.x), y(p.y), z(p.z) {}

  // *************************************************************
  //    POINTS AND VECTORS (2d)
  // *************************************************************

  template <class Coord>
  class point2d;

  // Two-component vector (x, y) over the coordinate type Coord.
  template <class Coord>
  class vector2d {
  public:
    using coord = Coord;
    using point = point2d<coord>;
    using vector = vector2d;

    static const int dim = 2;

    coord x;
    coord y;

    // Constructors: zero vector, explicit components, from a point (defined
    // after point2d), or from the first two entries of a slice.
    vector2d() : x(0), y(0) {}
    vector2d(coord x0, coord y0) : x(x0), y(y0) {}
    vector2d(point p);
    vector2d(parlay::slice<coord*,coord*> p) : x(p[0]), y(p[1]) {}

    // Component-wise arithmetic.
    vector operator+(vector op2) { return vector(x + op2.x, y + op2.y); }
    vector operator-(vector op2) { return vector(x - op2.x, y - op2.y); }
    point operator+(point op2);
    vector operator*(coord s) { return vector(x * s, y * s); }
    vector operator/(coord s) { return vector(x / s, y / s); }

    // Read-only access by index (returned by value): 0 -> x, otherwise y.
    coord operator[] (int i) {
      if (i == 0) return x;
      return y;
    }

    // Dot product and scalar 2d cross product (x*v.y - y*v.x).
    coord dot(vector v) { return x * v.x + y * v.y; }
    coord cross(vector v) { return x*v.y - y*v.x; }

    // Larger of the two components.
    coord maxDim() { return max(x, y); }

    // Writes ":(x,y):" to cout.
    void print() { cout << ":(" << x << "," << y << "):"; }

    // Euclidean length and its square.
    coord Length(void) { return sqrt(x*x+y*y); }
    coord sqLength(void) { return x*x+y*y; }
  };

  // Streams a 3d vector as "x y z" (space separated).
  template <class coord>
  static std::ostream& operator<<(std::ostream& os, const vector3d<coord> v) {
    os << v.x << " " << v.y;
    os << " " << v.z;
    return os;
  }

  // Streams a 3d point as "x y z" (space separated).
  template <class coord>
  static std::ostream& operator<<(std::ostream& os, const point3d<coord> v) {
    os << v.x << " " << v.y;
    os << " " << v.z;
    return os;
  }

  // Two-dimensional point (x, y) over the coordinate type Coord.
  template <class Coord>
  class point2d {
  public:
    using coord = Coord;
    using vector = vector2d<coord>;
    using point = point2d;

    static const int dim = 2;

    coord x;
    coord y;

    int dimension() { return 2; }

    // Constructors: origin, explicit coordinates, from a vector, or from the
    // first two entries of a slice.
    point2d() : x(0), y(0) {}
    point2d(coord x0, coord y0) : x(x0), y(y0) {}
    point2d(vector v) : x(v.x), y(v.y) {}
    point2d(parlay::slice<coord*,coord*> p) : x(p[0]), y(p[1]) {}

    // Writes ":(x,y):" to cout.
    void print() { cout << ":(" << x << "," << y << "):"; }

    // point - point gives the displacement vector; point + vector a point.
    vector operator-(point op2) { return vector(x - op2.x, y - op2.y); }
    point operator+(vector op2) { return point(x + op2.x, y + op2.y); }

    // Read-only access by index (returned by value): 0 -> x, otherwise y.
    coord operator[] (int i) {
      if (i == 0) return x;
      return y;
    }

    // Component-wise minimum / maximum with b.
    point minCoords(point b) { return point(min(x, b.x), min(y, b.y)); }
    point maxCoords(point b) { return point(max(x, b.x), max(y, b.y)); }

    // Quadrant index in [0..3] relative to center: bit 0 is set when
    // x > center.x, bit 1 when y > center.y.
    int quadrant(point center) {
      return (x > center.x ? 1 : 0) + (y > center.y ? 2 : 0);
    }

    // Returns this point moved by +/-offset along both axes; bit k of dir
    // (an integer from [0..3]) selects +offset (set) or -offset (clear).
    point offsetPoint(int dir, coord offset) {
      coord dx = (dir & 1) ? offset : -offset;
      coord dy = (dir & 2) ? offset : -offset;
      return point(x + dx, y + dy);
    }

    // True when pt lies outside the axis-aligned box centered at this point
    // with half-width hsize.
    bool outOfBox(point pt, coord hsize) {
      return (x - hsize > pt.x) || (x + hsize < pt.x)
          || (y - hsize > pt.y) || (y + hsize < pt.y);
    }
  };

  // vector + point: component-wise sum, returned as a point.
  template <class coord>
  inline point2d<coord> vector2d<coord>::operator+(point2d<coord> op2) {
    point2d<coord> sum(x + op2.x, y + op2.y);
    return sum;
  }

  // Builds the vector whose components equal the coordinates of p.
  template <class coord>
  inline vector2d<coord>::vector2d(point2d<coord> p)
    : x(p.x), y(p.y) {}

  // Streams a 2d vector as "x y" (space separated).
  template <class coord>
  static std::ostream& operator<<(std::ostream& os, const vector2d<coord> v) {
    os << v.x << " ";
    os << v.y;
    return os;
  }

  // Streams a 2d point as "x y" (space separated).
  template <class coord>
  static std::ostream& operator<<(std::ostream& os, const point2d<coord> v) {
    os << v.x << " ";
    os << v.y;
    return os;
  }

  // *************************************************************
  //    GEOMETRY
  // *************************************************************

  // Twice the signed area of the oriented triangle (a, b, c), computed as
  // the 2d cross product of the edge vectors (b - a) and (c - a).
  template <class coord>
  inline coord triArea(point2d<coord> a, point2d<coord> b, point2d<coord> c) {
    vector2d<coord> ab = b - a;
    vector2d<coord> ac = c - a;
    return ab.cross(ac);
  }

  // triArea(a, b, c) divided by the product of the lengths of the edges
  // (b - a) and (c - a).
  template <class coord>
  inline coord triAreaNormalized(point2d<coord> a, point2d<coord> b, point2d<coord> c) {
    auto doubledArea = triArea(a,b,c);
    auto edgeProduct = (b-a).Length()*(c-a).Length();
    return doubledArea/edgeProduct;
  }

  // Returns TRUE if the points a, b, c are in counterclockwise order, i.e.
  // the cross product of (b - a) and (c - a) is strictly positive.
  template <class coord>
  inline bool counterClockwise(point2d<coord> a, point2d<coord> b, point2d<coord> c) {
    auto orientation = (b-a).cross(c-a);
    return orientation > 0.0;
  }

  // Lifts the 2d vector v to 3d with z = x^2 + y^2.
  template <class coord>
  inline vector3d<coord> onParabola(vector2d<coord> v) {
    vector3d<coord> lifted(v.x, v.y, v.x*v.x + v.y*v.y);
    return lifted;
  }

  // Returns TRUE if the point d is inside the circle defined by the
  // points a, b, c.
  // Lifts a, b, c (taken relative to d, so d sits at the origin) onto the
  // paraboloid z = x^2 + y^2 and performs a plane-side test: the triple
  // product (ad x bd) . cd must be strictly positive.
  template <class coord>
  inline bool inCircle(point2d<coord> a, point2d<coord> b, 
		       point2d<coord> c, point2d<coord> d) {
    vector3d<coord> liftedA = onParabola(a-d);
    vector3d<coord> liftedB = onParabola(b-d);
    vector3d<coord> liftedC = onParabola(c-d);
    auto volume = (liftedA.cross(liftedB)).dot(liftedC);
    return volume > 0.0;
  }

  // Normalized version of the inCircle test: the same triple product,
  // divided by the product of the three lifted vectors' lengths.
  // Returns a number between -1 and 1, such that -1 is out at infinity,
  // positive numbers are on the inside, and 0 is at the boundary.
  template <class coord>
  inline double inCircleNormalized(point2d<coord> a, point2d<coord> b, 
				   point2d<coord> c, point2d<coord> d) {
    vector3d<coord> liftedA = onParabola(a-d);
    vector3d<coord> liftedB = onParabola(b-d);
    vector3d<coord> liftedC = onParabola(c-d);
    auto volume = (liftedA.cross(liftedB)).dot(liftedC);
    auto scale = liftedA.Length()*liftedB.Length()*liftedC.Length();
    return volume/scale;
  }

  // *************************************************************
  //    TRIANGLES
  // *************************************************************

  using tri = std::array<int,3>;

  // A triangle mesh: a sequence of points P and a sequence T of triangles,
  // each triangle being three ints (see `tri`).
  template <class point>
  struct triangles {
    parlay::sequence<point> P;
    parlay::sequence<tri> T;

    // Empty mesh.
    triangles() {}

    // Takes ownership of the given point and triangle sequences.
    triangles(parlay::sequence<point> pts, parlay::sequence<tri> tris)
      : P(std::move(pts)), T(std::move(tris)) {}

    size_t numPoints() { return P.size(); }
    size_t numTriangles() { return T.size(); }
  };

  // A ray given by a point o and a vector d of the point type's matching
  // vector type.
  template <class point>
  struct ray {
    using vector = typename point::vector;

    point o;
    vector d;

    ray() {}
    ray(point origin, vector direction) : o(origin), d(direction) {}
  };

  // Returns the angle, in degrees, at vertex a between the edges (b - a)
  // and (c - a), computed from the arc-cosine of their normalized dot
  // product.
  template<class coord>
  inline coord angle(point2d<coord> a, point2d<coord> b, point2d<coord> c) {
    vector2d<coord> ba = (b-a);
    vector2d<coord> ca = (c-a);
    coord lba = ba.Length();
    coord lca = ca.Length();
    // Full-precision pi; the former 3.14159 was only good to about 1e-6.
    coord pi = 3.14159265358979323846;
    coord cosine = ba.dot(ca)/(lba*lca);
    // Rounding can push the cosine marginally outside [-1, 1], where acos
    // returns NaN; pull it back into range. Written with comparisons so
    // that a NaN from a zero-length edge still propagates unchanged.
    if (cosine > 1) cosine = 1;
    else if (cosine < -1) cosine = -1;
    return 180/pi*acos(cosine);
  }

  // Returns nonzero (the boolean result converted to coord) when the cosine
  // of at least one interior angle of triangle (a, b, c) exceeds
  // cos(angle), where `angle` is given in degrees.
  template<class coord>
  inline coord minAngleCheck(point2d<coord> a, point2d<coord> b, point2d<coord> c, coord angle) {
    vector2d<coord> ba = (b-a);
    vector2d<coord> ca = (c-a);
    vector2d<coord> cb = (c-b);
    coord lba = ba.Length();
    coord lca = ca.Length();
    coord lcb = cb.Length();
    // Full-precision pi; the former 3.14159 was only good to about 1e-6.
    coord pi = 3.14159265358979323846;
    coord co = cos(angle*pi/180.);
    // The three terms are the cosines at a, c and b respectively; the sign
    // flip on the last one turns (b - a) into (a - b).
    return (ba.dot(ca)/(lba*lca) > co || ca.dot(cb)/(lca*lcb) > co || 
	    -ba.dot(cb)/(lba*lcb) > co);
  }

  // Returns the circumcenter of triangle (a, b, c), expressed as a plus an
  // offset built from the edge vectors (b - a) and (c - a).
  template<class coord>
  inline point2d<coord> triangleCircumcenter(point2d<coord> a, point2d<coord> b, point2d<coord> c) {
    vector2d<coord> ab = b-a;
    vector2d<coord> ac = c-a;
    // Each edge scaled by the squared length of the other one.
    vector2d<coord> abScaled = ab * ac.dot(ac);
    vector2d<coord> acScaled = ac * ab.dot(ab);
    vector2d<coord> numerator(acScaled.y - abScaled.y, abScaled.x - acScaled.x);
    auto denominator = 2.0 * ab.cross(ac);
    return a + numerator/denominator;
  }


================================================
FILE: algorithms/bench/common/geometryIO.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once
#include "../parlay/parallel.h"
#include "../parlay/primitives.h"
#include "geometry.h"
#include "IO.h"

//using namespace geometry;
using namespace benchIO;

  // Length of the text form of a 2d point: both coordinate lengths plus one
  // separating character.
  template <class coord>
  inline int xToStringLen(point2d<coord> a) { 
    const auto xLen = xToStringLen(a.x);
    const auto yLen = xToStringLen(a.y);
    return xLen + yLen + 1;
  }

  // Writes a into s as "x y": the x text, one space, then the y text.
  template <class coord>
  inline void xToString(char* s, point2d<coord> a) { 
    int xLen = xToStringLen(a.x);
    char* cursor = s;
    xToString(cursor, a.x);
    cursor += xLen;
    *cursor = ' ';
    xToString(cursor + 1, a.y);
  }

  // Length of the text form of a 3d point: the three coordinate lengths
  // plus two separating characters.
  template <class coord>
  inline int xToStringLen(point3d<coord> a) { 
    const auto xLen = xToStringLen(a.x);
    const auto yLen = xToStringLen(a.y);
    const auto zLen = xToStringLen(a.z);
    return xLen + yLen + zLen + 2;
  }

  // Writes a into s as "x y z": the three coordinate texts separated by
  // single spaces.
  template <class coord>
  inline void xToString(char* s, point3d<coord> a) { 
    int xLen = xToStringLen(a.x);
    int yLen = xToStringLen(a.y);
    char* yStart = s + xLen + 1;
    char* zStart = yStart + yLen + 1;
    xToString(s, a.x);
    yStart[-1] = ' ';
    xToString(yStart, a.y);
    zStart[-1] = ' ';
    xToString(zStart, a.z);
  }

  // inline int xToStringLen(tri a) { 
  //   return xToStringLen(a[0]) + xToStringLen(a[1]) + xToStringLen(a[2]) + 2;
  // }

  // inline void xToString(char* s, tri a) { 
  //   int lx = xToStringLen(a[0]);
  //   int ly = xToStringLen(a[1]);
  //   xToString(s, a[0]);
  //   s[lx] = ' ';
  //   xToString(s+lx+1, a[1]);
  //   s[lx+ly+1] = ' ';
  //   xToString(s+lx+ly+2, a[2]);
  // }

namespace benchIO {
  using namespace std;

  // Header words identifying each geometry file format; the readers below
  // compare the first word of a file against them.
  string HeaderPoint2d = "pbbs_sequencePoint2d";
  string HeaderPoint3d = "pbbs_sequencePoint3d";
  string HeaderTriangles = "pbbs_triangles";

  // Writes the points P to file fname via writeSeqToFile, using the 2d or
  // 3d header according to Point::dim. Returns writeSeqToFile's result.
  template <class Point>
    int writePointsToFile(parlay::sequence<Point> const &P, char const *fname) {
    string Header = (Point::dim == 2) ? HeaderPoint2d : HeaderPoint3d;
    int r = writeSeqToFile(Header, P, fname);
    return r;
  }

  // Converts a sequence of words W into points: every word is parsed with
  // atof, and each consecutive group of Point::dim values forms one point.
  // Trailing words that do not fill a whole point are ignored.
  template <class Point, class Seq>
  parlay::sequence<Point> parsePoints(Seq W) {
    using coord = typename Point::coord;
    int d = Point::dim;
    size_t n = W.size()/d;
    auto a = parlay::tabulate(d * n, [&] (size_t i) -> coord {
	return atof(W[i]);});
    auto points = parlay::tabulate(n, [&] (size_t i) -> Point {
	return Point(a.cut(d*i,d*(i + 1)));});
    return points;
  }

  // Reads a point file: the first word must be the header matching
  // Point::dim, the remaining words are the coordinates. Prints a message
  // and aborts when the file is empty or the header does not match.
  template <class Point>
  parlay::sequence<Point> readPointsFromFile(char const *fname) {
    parlay::sequence<char> S = readStringFromFile(fname);
    parlay::sequence<char*> W = stringToWords(S);
    int d = Point::dim;
    if (W.size() == 0 || W[0] != (d == 2 ? HeaderPoint2d : HeaderPoint3d)) {
      cout << "readPointsFromFile wrong file type" << endl;
      abort();
    }
    return parsePoints<Point>(W.cut(1,W.size()));
  }

  // triangles<point2d> readTrianglesFromFileNodeEle(char const *fname) {
  //   string nfilename(fname);
  //   _seq<char> S = readStringFromFile((char*)nfilename.append(".node").c_str());
  //   words W = stringToWords(S.A, S.n);
  //   triangles<point2d> Tr;
  //   Tr.numPoints = atol(W.Strings[0]);
  //   if (W.m < 4*Tr.numPoints + 4) {
  //     cout << "readStringFromFileNodeEle inconsistent length" << endl;
  //     abort();
  //   }

  //   Tr.P = newA(point2d, Tr.numPoints);
  //   for(intT i=0; i < Tr.numPoints; i++) 
  //     Tr.P[i] = point2d(atof(W.Strings[4*i+5]), atof(W.Strings[4*i+6]));

  //   string efilename(fname);
  //   _seq<char> SN = readStringFromFile((char*)efilename.append(".ele").c_str());
  //   words WE = stringToWords(SN.A, SN.n);
  //   Tr.numTriangles = atol(WE.Strings[0]);
  //   if (WE.m < 4*Tr.numTriangles + 3) {
  //     cout << "readStringFromFileNodeEle inconsistent length" << endl;
  //     abort();
  //   }

  //   Tr.T = newA(triangle, Tr.numTriangles);
  //   for (long i=0; i < Tr.numTriangles; i++)
  //     for (int j=0; j < 3; j++)
  // 	Tr.T[i].C[j] = atol(WE.Strings[4*i + 4 + j]);

  //   return Tr;
  // }

  // Reads a triangle file laid out as: header word, point count n, triangle
  // count m, then dim*n coordinates followed by 3*m vertex indices.
  // `offset` is subtracted from every vertex index read.
  // Prints a message and aborts when the header is missing or wrong, or
  // when the word count does not match n and m.
  template <class pointT>
  triangles<pointT> readTrianglesFromFile(char const *fname, int offset) {
    int d = pointT::dim;
    parlay::sequence<char> S = readStringFromFile(fname);
    parlay::sequence<char*> W = stringToWords(S);
    int headerSize = 3;
    // All three header words must exist before W[0..2] are touched; a
    // shorter file cannot be a valid triangle file.
    if (W.size() < (size_t) headerSize || W[0] != HeaderTriangles) {
      cout << "readTrianglesFromFile wrong file type" << endl;
      abort();
    }

    size_t n = atol(W[1]);
    size_t m = atol(W[2]);
    if (W.size() != headerSize + 3 * m + d * n) {
      cout << "readTrianglesFromFile inconsistent length" << endl;
      abort();
    }

    auto pts_slice = W.cut(headerSize, headerSize + d * n);
    auto tri_slice = W.cut(headerSize + d * n, W.size());
    parlay::sequence<pointT> Pts = parsePoints<pointT>(pts_slice);
    auto Tri = parlay::tabulate(m, [&] (size_t i ) -> tri {
				     return {(int) atol(tri_slice[3*i])-offset,
					     (int) atol(tri_slice[3*i+1])-offset,
					     (int) atol(tri_slice[3*i+2])-offset};});
    return triangles<pointT>(Pts,Tri);
  }

  // Writes Tr to fileName: the triangle header, the point and triangle
  // counts (one per line), the points, then the triangle vertex indices
  // flattened three per triangle.
  // Returns 1 (after printing a message) if the file cannot be opened,
  // 0 otherwise.
  template <class pointT>
  int writeTrianglesToFile(triangles<pointT> Tr, char* fileName) {
    ofstream file (fileName, ios::binary);
    if (!file.is_open()) {
      std::cout << "Unable to open file: " << fileName << std::endl;
      return 1;
    }
    file << HeaderTriangles << endl;
    file << Tr.numPoints() << endl; 
    file << Tr.numTriangles() << endl; 
    writeSeqToStream(file, Tr.P);
    //writeSeqToStream(file, Tr.T);
    // Flatten the triangles so entry i is corner i%3 of triangle i/3.
    auto A = parlay::tabulate(3*Tr.numTriangles(), [&] (size_t i) -> int {
      						     return (Tr.T[i/3])[i%3];});
    writeSeqToStream(file, A);
    file.close();
    return 0;
  }

};


================================================
FILE: algorithms/bench/common/get_time.h
================================================
#pragma once

#include <stdlib.h>
#include <sys/time.h>
#include <iomanip>
#include <iostream>
#include <string>

// Wall-clock stopwatch based on gettimeofday(). Accumulates elapsed seconds
// across start()/stop() (or get_next()) intervals and can print them to
// std::cout prefixed by the timer's name.
struct timer {
  double total_time;   // seconds accumulated so far
  double last_time;    // get_time() at the latest start() / get_next()
  bool on;             // true while an interval is being timed
  std::string name;    // prefix used by report()
  struct timezone tzp;

  // Creates a timer called `name`; starts it immediately unless _start is
  // false. last_time is zero-initialized so it never holds an indeterminate
  // value, even if stop() is called on a timer that was never started.
  timer(std::string name = "PBBS time", bool _start = true)
  : total_time(0.0), last_time(0.0), on(false), name(name), tzp({0,0}) {
    if (_start) start();
  }

  // Current wall-clock time in seconds, with microsecond resolution.
  double get_time() {
    timeval now;
    gettimeofday(&now, &tzp);
    return ((double) now.tv_sec) + ((double) now.tv_usec)/1000000.;
  }

  // Begins (or restarts) an interval at the current time.
  void start () {
    on = true;
    last_time = get_time();
  }

  // Ends the current interval, adds its length to the total and returns it.
  double stop () {
    on = false;
    double d = (get_time()-last_time);
    total_time += d;
    return d;
  }

  // Clears the accumulated total and turns the timer off.
  void reset() {
     total_time=0.0;
     on=false;
  }

  // Accumulated seconds, including the running interval if the timer is on.
  double get_total() {
    if (on) return total_time + get_time() - last_time;
    else return total_time;
  }

  // If on: closes the running interval, adds it to the total, starts a new
  // interval at the same instant and returns the closed interval's length.
  // If off: returns 0.0.
  double get_next() {
    if (!on) return 0.0;
    double t = get_time();
    double td = t - last_time;
    total_time += td;
    last_time = t;
    return td;
  }

  // Prints "<name>: [<str>: ]<time>" to std::cout in fixed notation with 4
  // decimals, then restores the stream's previous formatting state.
  void report(double time, std::string str) {
    std::ios::fmtflags cout_settings = std::cout.flags();
    // precision() is not part of fmtflags, so it has to be saved and
    // restored separately or the 4-digit setting would leak to the caller.
    std::streamsize cout_precision = std::cout.precision(4);
    std::cout << std::fixed;
    std::cout << name << ": ";
    if (str.length() > 0)
      std::cout << str << ": ";
    std::cout << time << std::endl;
    std::cout.flags(cout_settings);
    std::cout.precision(cout_precision);
  }

  // Reports the accumulated total under the label "total", then zeroes it.
  void total() {
    report(get_total(),"total");
    total_time = 0.0;
  }

  // Reports the accumulated total under the label str without resetting.
  void reportTotal(std::string str) {
    report(get_total(), str);
  }

  // If on, reports the time since the previous start()/next() under str.
  void next(std::string str) {
    if (on) report(get_next(), str);
  }
};

// Default timer used by the convenience macros below. It is `static`, so
// each translation unit including this header gets its own instance; it is
// constructed with the default arguments and therefore starts immediately.
static timer _tm;
// startTime(): (re)starts the default timer. The expansion already ends in
// a semicolon.
#define startTime() _tm.start();
// nextTime(s): if the default timer is on, prints the time elapsed since the
// previous start/next under the label s.
#define nextTime(_string) _tm.next(_string);


================================================
FILE: algorithms/bench/common/glue.h
================================================
#pragma once

#include "../pbbslib/hash_table.h"
#include "../pbbslib/integer_sort.h"

// Default signed / unsigned integer types used by the helpers in this header.
using intT = int;
using uintT = unsigned int;

// newA(E, n): malloc's storage for n objects of type E and casts the result
// to E*. The memory is uninitialized and the result is not checked for NULL.
#define newA(__E,__n) (__E*) malloc((__n)*sizeof(__E))

namespace utils {

  // Prints s to std::cout and aborts the process when cond is zero;
  // does nothing otherwise.
  static void myAssert(int cond, std::string s) {
    if (cond) return;
    std::cout << s << std::endl;
    abort();
  }

  // Integer mixing function: six rounds of add/xor combined with shifts,
  // all in unsigned (wrapping) arithmetic.
  inline unsigned int hash(unsigned int a)
  {
    a += 0x7ed55d16 + (a << 12);
    a ^= 0xc761c23c ^ (a >> 19);
    a += 0x165667b1 + (a << 5);
    a = (a + 0xd3a2646c) ^ (a << 9);
    a += 0xfd7046c5 + (a << 3);
    a ^= 0xb55a4f09 ^ (a >> 16);
    return a;
  }

  // hash(a) with the top bit cleared, i.e. restricted to the low 31 bits.
  inline int hashInt(unsigned int a) {
    const unsigned int low31 = ((unsigned) 1 << 31) - 1;
    return hash(a) & low31;
  }

  // template <class E>
  // struct identityF { E operator() (const E& x) {return x;}};

  // template <class E>
  // struct addF { E operator() (const E& a, const E& b) const {return a+b;}};

  // template <class E>
  // struct absF { E operator() (const E& a) const {return std::abs(a);}};

  // template <class E>
  // struct zeroF { E operator() (const E& a) const {return 0;}};

  // template <class E>
  // struct maxF { E operator() (const E& a, const E& b) const {return (a>b) ? a : b;}};

  // template <class E>
  // struct minF { E operator() (const E& a, const E& b) const {return (a<b) ? a : b;}};

  // template <class E1, class E2>
  // struct firstF {E1 operator() (std::pair<E1,E2> a) {return a.first;} };

  // template <class E1, class E2>
  // struct secondF {E2 operator() (std::pair<E1,E2> a) {return a.second;} };

}

// template <class T>
// struct _seq {
//   T* A;
//   long n;
//   _seq() {A = NULL; n=0;}
//   _seq(T* _A, long _n) : A(_A), n(_n) {}
//   void del() {free(A);}
// };

// namespace osequence {

//   template <class ET, class intT, class F>
//   ET scan(ET *In, ET* Out, intT n, F f, ET zero) {
//     if (In == Out)
//       return pbbs::scan_inplace(pbbs::range<ET*>(In,In+n),pbbs::make_monoid(f,zero));
//     else {
//       std::cout << "NYI in scan" << std::endl;
//       return zero;
//     }
//   }

//   template <class ET>
//   ET plusScan(ET *In, ET* Out, size_t n) {
//     return scan(In, Out, n, [&] (ET a, ET b) {return a + b;}, (ET) 0);
//   }
  
//   template <class ET, class PRED>
//   size_t filter(ET* In, ET* Out, size_t n, PRED p) {
//     pbbs::sequence<ET> r = pbbs::filter(pbbs::range<ET*>(In,In+n), p);
//     parallel_for(0, r.size(), [&] (size_t i) {Out[i] = r[i];});
//     return r.size();
//   }

// };

namespace dataGen {

  using namespace std;

#define HASH_MAX_INT ((unsigned) 1 << 31)

  //#define HASH_MAX_LONG ((unsigned long) 1 << 63)

  // Maps an index i to a value of type T derived from utils::hash(i).
  // Only the explicit specializations below are defined.
  template <class T> T hash(intT i);

  // intT: the hash masked with HASH_MAX_INT - 1 (its low 31 bits).
  template <>
  inline intT hash<intT>(intT i) {
    const unsigned int mask = HASH_MAX_INT-1;
    const unsigned int bits = utils::hash(i);
    return bits & mask;
  }

  // uintT: the unmodified hash.
  template <>
  inline uintT hash<uintT>(intT i) {
    return utils::hash(i);
  }

  // double: hash<intT>(i) divided by HASH_MAX_INT.
  template <>
  inline double hash<double>(intT i) {
    const double numerator = (double) hash<intT>(i);
    const double denominator = (double) HASH_MAX_INT;
    return numerator/denominator;
  }

};

// template <class HASH, class ET>
// _seq<ET> removeDuplicates(_seq<ET> S, HASH hashF) {
//   return pbbs::remove_duplicates(pbbs::range<ET*>(S.A, S.A+S.n), hashF);
// }


================================================
FILE: algorithms/bench/common/graph.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <iostream>
#include <algorithm>
#include "../parlay/parallel.h"
#include "../parlay/primitives.h"

// The graph types below are parameterized by two integer types, intV and
// intE, which should be chosen according to the size of the graph:
//  intV should have enough range to represent |V|
//  intE should have enough range to represent |E|
//  intE defaults to intV if not specified
// DefaultIntV is the vertex-id type and DefaultWeight the edge-weight type
// used when a template argument is omitted.
using DefaultIntV = int;
using DefaultWeight = float;

// **************************************************************
//    EDGE ARRAY REPRESENTATION
// **************************************************************

// An unweighted edge: a pair of vertex ids (u, v).
// The default constructor leaves both ids uninitialized.
template <class intV = DefaultIntV>
struct edge {
  intV u;
  intV v;

  edge() {}
  edge(intV first, intV second) : u(first), v(second) {}
};

// A list of unweighted edges together with the dimensions (numRows x
// numCols) of the vertex-id space they live in. nonZeros is the number of
// edges held.
template <class intV = DefaultIntV>
struct edgeArray {
  parlay::sequence<edge<intV>> E;
  size_t numRows;
  size_t numCols;
  size_t nonZeros;
  // Takes ownership of EE; r and c are stored as numRows / numCols.
  // nonZeros reads the member E, which is initialized first (declaration
  // order), so it sees the moved-in sequence.
  edgeArray(parlay::sequence<edge<intV>> EE, size_t r, size_t c) :
    E(std::move(EE)), numRows(r), numCols(c), nonZeros(E.size()) {}
  // Empty edge array; the size fields are zeroed so they are consistent
  // with the empty sequence instead of being left indeterminate.
  edgeArray() : numRows(0), numCols(0), nonZeros(0) {}
  // Returns a copy of the i-th edge (no bounds checking here).
  edge<intV> operator[] (const size_t i) const {return E[i];}
};

// **************************************************************
//    WEIGHTED EDGE ARRAY
// **************************************************************

// A weighted edge: vertex ids u and v plus a weight.
// The default constructor leaves all three fields uninitialized.
template <class intV = DefaultIntV, class Weight=DefaultWeight>
struct wghEdge {
  intV u, v;
  Weight weight;

  wghEdge() {}
  wghEdge(intV from, intV to, Weight w)
    : u(from), v(to), weight(w) {}
};

// A list of weighted edges over n vertices; m is the number of edges held.
template <class intV = DefaultIntV, class Weight=DefaultWeight>
struct wghEdgeArray {
  using W = Weight;
  parlay::sequence<wghEdge<intV,W>> E;
  size_t n; size_t m;
  // Takes ownership of E_; n is the vertex count. m reads the member E,
  // which is initialized first (declaration order).
  wghEdgeArray(parlay::sequence<wghEdge<intV,W>> E_, intV n) 
    : E(std::move(E_)), n(n), m(E.size()) {}
  // Empty edge array; n and m are zeroed rather than left indeterminate.
  wghEdgeArray() : n(0), m(0) {}
  // Returns a copy of the i-th edge. The return type carries this array's
  // own weight type W; the former `wghEdge<intV>` silently meant the default
  // weight type and did not match E's element type for any other W.
  wghEdge<intV,W> operator[] (const size_t i) const {return E[i];}
};

// **************************************************************
//    ADJACENCY ARRAY REPRESENTATION
// **************************************************************

// Read-only view of one vertex's adjacency list: a pointer to its
// neighbor ids and their count. Does not own the storage.
template <class intV = DefaultIntV>
struct vertex {
  const intV* Neighbors;
  intV degree;

  vertex() : Neighbors(nullptr), degree(0) {}
  vertex(const intV* N, const intV d)
    : Neighbors(N), degree(d) {}
};

// Mutable view of one vertex's adjacency list: a pointer to its neighbor
// ids and their count. Does not own the storage.
template <class intV = DefaultIntV>
struct mod_vertex {
  intV* Neighbors;
  intV degree;

  mod_vertex() : Neighbors(nullptr), degree(0) {}
  mod_vertex(intV* N, intV d)
    : Neighbors(N), degree(d) {}
};

// Unweighted graph in adjacency-array form: `edges` holds all neighbor
// lists back to back and `offsets[i]` is where vertex i's list starts
// (offsets has n + 1 entries). When `degrees` is non-empty it overrides the
// degree implied by consecutive offsets.
template <class intV = DefaultIntV, class intE = intV>
struct graph {
  using vertexId = intV;
  using edgeId = intE;
  using MVT = mod_vertex<intV>;
  using VT = vertex<intV>;
  parlay::sequence<intE> offsets;
  parlay::sequence<intV> edges;
  parlay::sequence<intV> degrees; // not always used
  size_t n;
  size_t m;
  size_t numVertices() const {return n;}
  // Number of edges: m when no explicit degrees are stored, otherwise the
  // sum of the stored degrees.
  // (A leftover "hello numEdges" debug print used to be emitted on the
  // second path; it has been removed so this accessor no longer writes to
  // stdout.)
  size_t numEdges() const {
    if (degrees.size() == 0) return m;
    else {
      auto dgs = parlay::delayed_seq<intE>(n, [&] (size_t i) {
	  return degrees[i];});
      return parlay::reduce(dgs, parlay::addm<intE>());
    }
  }

  const parlay::sequence<intE>& get_offsets() const {
    return offsets;
  }

  // Fills `degrees` from consecutive offsets.
  void addDegrees() {
    degrees = parlay::tabulate(n, [&] (size_t i) -> intV {
	return offsets[i+1] - offsets[i];});
  }

  // Mutable view of vertex i's neighbor list (no bounds checking here).
  MVT operator[] (const size_t i) {
    return MVT(edges.data() + offsets[i],
	       (degrees.size() == 0)
	       ? offsets[i+1] - offsets[i] : degrees[i]);}

  // Read-only view of vertex i's neighbor list (no bounds checking here).
  const VT operator[] (const size_t i) const {
    return VT(edges.data() + offsets[i],
	      (degrees.size() == 0)
	      ? offsets[i+1] - offsets[i] : degrees[i]);
  }
  
  // Takes ownership of the offsets and edges sequences; n is the vertex
  // count and m is taken from the edge sequence's size. A mismatch between
  // offsets.size() and n + 1 is reported on stdout but not otherwise
  // handled.
  graph(parlay::sequence<intE> offsets_,
	parlay::sequence<intV> edges_,
	size_t n) 
    : offsets(std::move(offsets_)), edges(std::move(edges_)), n(n), m(edges.size()) {
    if (offsets.size() != n + 1) { std::cout << "error in graph constructor" << std::endl;}
  }
};

// **************************************************************
//    WEIGHTED ADJACENCY ARRAY REPRESENTATION
// **************************************************************

// View of one vertex of a weighted graph: pointers to its neighbor ids and
// to the matching edge weights, plus the neighbor count. Does not own the
// storage.
template <class intV = DefaultIntV, class Weight = DefaultWeight>
struct wghVertex {
  intV* Neighbors;
  intV degree;
  Weight* nghWeights;

  // Initializers are listed in member-declaration order, which is the order
  // in which they run.
  wghVertex(intV* N, Weight* W, intV d)
    : Neighbors(N), degree(d), nghWeights(W) {}
};

// Weighted graph in adjacency-array form: `edges` and `weights` are
// parallel arrays holding all neighbor lists back to back, and `offsets[i]`
// is where vertex i's list starts (offsets has n + 1 entries).
template <class intV = DefaultIntV, class Weight=DefaultWeight,
          class intE = intV>
struct wghGraph {
  using VT = wghVertex<intV,Weight>;
  using W = Weight;
  parlay::sequence<intE> offsets;
  parlay::sequence<intV> edges;
  parlay::sequence<Weight> weights;
  size_t n;
  size_t m;
  size_t numVertices() const {return n;}
  size_t numEdges() const {return m;}
  //const parlay::sequence<intV>& get_offsets() const {
  //  return offsets;
  //}
  // Returns a copy of the offsets array. The element type is intE, matching
  // the `offsets` member; the former declared type sequence<intV> only
  // agreed with it when intE and intV were the same type.
  parlay::sequence<intE> get_offsets() {
    return offsets;
  }
  // View of vertex i's neighbors and their weights (no bounds checking
  // here).
  VT operator[] (const size_t i) {
    return VT(edges.begin() + offsets[i],
	      weights.begin() + offsets[i],
	      offsets[i+1] - offsets[i]);}

  // Takes ownership of the three sequences; n is the vertex count and m is
  // taken from the edge sequence's size. Inconsistent sizes are reported on
  // stdout but not otherwise handled.
  wghGraph(parlay::sequence<intE> offsets_,
	   parlay::sequence<intV> edges_,
	   parlay::sequence<Weight> weights_,
	   size_t n) 
    : offsets(std::move(offsets_)), edges(std::move(edges_)),
      weights(std::move(weights_)), n(n), m(edges.size()) {
    if (offsets.size() != n + 1 || weights.size() != edges.size()) {
      std::cout << "error in weighted graph constructor" << std::endl;}
  }
};

// A weighted graph together with designated source and sink vertices.
template <typename intV>
struct FlowGraph {
  wghGraph<intV> g;
  intV source, sink;
  // Takes ownership of g (the by-value parameter is moved into the member).
  FlowGraph(wghGraph<intV> g, intV source, intV sink)
    : g(std::move(g)), source(source), sink(sink) {}
  // Returns an independent copy. wghGraph has no copy() member; its
  // sequence members are duplicated by the ordinary copy made when g is
  // passed by value to the constructor.
  FlowGraph copy() {
    return FlowGraph(g, source, sink);
  }
  // Releases the graph's storage now. wghGraph has no del() member, so the
  // sequences are replaced by empty ones and the counts are zeroed.
  void del() {
    g.offsets = decltype(g.offsets)();
    g.edges = decltype(g.edges)();
    g.weights = decltype(g.weights)();
    g.n = 0;
    g.m = 0;
  }
};


================================================
FILE: algorithms/bench/common/graphIO.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2010 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once
#include <iostream>
#include <stdint.h>
#include <cstring>
#include "../parlay/parallel.h"
#include "IO.h"
#include "graphUtils.h"

#include <sys/mman.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

using namespace benchIO;

// Length of the text form of an unweighted edge: the lengths of both
// endpoints plus one separator character.
template <class intV>
int xToStringLen(edge<intV> a) {
  const auto sourceLen = xToStringLen(a.u);
  const auto targetLen = xToStringLen(a.v);
  return sourceLen + targetLen + 1;
}

// Writes an unweighted edge into s as "<u> <v>": the source endpoint, a single
// space, then the target endpoint.
template <class intV>
void xToString(char* s, edge<intV> a) {
  const int sourceLen = xToStringLen(a.u);
  char* const separator = s + sourceLen;
  xToString(s, a.u);
  *separator = ' ';
  xToString(separator + 1, a.v);
}

// Length of the text form of a weighted edge: the lengths of the two endpoints
// and the weight, plus two separator characters.
template <class intV, class Weight>
int xToStringLen(wghEdge<intV,Weight> a) {
  const auto sourceLen = xToStringLen(a.u);
  const auto targetLen = xToStringLen(a.v);
  const auto weightLen = xToStringLen(a.weight);
  return sourceLen + targetLen + weightLen + 2;
}

// Writes a weighted edge into s as "<u> <v> <weight>", with single spaces
// between the three fields.
template <class intV, class Weight>
void xToString(char* s, wghEdge<intV, Weight> a) {
  const int sourceLen = xToStringLen(a.u);
  const int targetLen = xToStringLen(a.v);
  char* const targetStart = s + sourceLen + 1;
  char* const weightStart = targetStart + targetLen + 1;
  xToString(s, a.u);
  targetStart[-1] = ' ';
  xToString(targetStart, a.v);
  weightStart[-1] = ' ';
  xToString(weightStart, a.weight);
}

namespace benchIO {
  using namespace std;

  // First token of each supported graph file format. Declared `inline` so that
  // including this header from several translation units yields one shared
  // definition instead of a multiple-definition link error.
  inline string AdjGraphHeader = "AdjacencyGraph";
  inline string EdgeArrayHeader = "EdgeArray";
  inline string WghEdgeArrayHeader = "WeightedEdgeArray";
  inline string WghAdjGraphHeader = "WeightedAdjacencyGraph";

  // Writes G to fname under the AdjacencyGraph header as one flat sequence:
  // [n, m, n vertex offsets, m edge targets]. A graph that carries a non-empty
  // degrees array is packed first and the packed copy is written instead.
  // Returns whatever writeSeqToFile returns.
  template <class intV, class intE>
  int writeGraphToFile(graph<intV, intE> const &G, char* fname) {
    if (G.degrees.size() > 0) {
      graph<intV, intE> packed = packGraph(G);
      return writeGraphToFile(packed, fname);
    }

    const size_t numEdges = G.numEdges();
    const size_t numVerts = G.numVertices();
    parlay::sequence<size_t> flat(2 + numVerts + numEdges);
    flat[0] = numVerts;
    flat[1] = numEdges;

    parlay::sequence<intE> const &offsets = G.get_offsets();

    // vertex offsets occupy flat[2 .. 2+n)
    parlay::parallel_for (0, numVerts, [&] (size_t v) {
        flat[v + 2] = offsets[v];
      });

    // edge targets occupy flat[2+n .. 2+n+m)
    const size_t edgeBase = 2 + numVerts;
    parlay::parallel_for(0, numVerts, [&] (size_t v) {
        size_t start = offsets[v] + edgeBase;
        for (intV k = 0; k < G[v].degree; k++)
          flat[start + k] = G[v].Neighbors[k];
      });

    return writeSeqToFile(AdjGraphHeader, flat, fname);
  }

  // Writes a weighted graph to fname under the WeightedAdjacencyGraph header.
  // The integer part [n, m, n vertex offsets, m edge targets] and the m edge
  // weights are assembled in two separate sequences because the weights may be
  // floating point. Returns whatever write2SeqToFile returns.
  template <class intV, class Weight, class intE>
  int writeWghGraphToFile(wghGraph<intV,Weight,intE> G, char* fname) {
    size_t m = G.m;
    size_t n = G.n;
    parlay::sequence<size_t> Out1(2 + n + m);
    parlay::sequence<Weight> Out2(m);
    Out1[0] = n;
    // The edge count belongs in the integer header. It was previously stored in
    // Out2[1], which left Out1[1] unset and wrote out of bounds when m < 2.
    Out1[1] = m;

    // write offsets to Out1[2,..,2+n)
    auto offsets = G.get_offsets();
    parlay::parallel_for (0, n, [&] (size_t i) {
        Out1[i+2] = offsets[i];});

    // write out edges to Out1[2+n,..,2+n+m)
    // and weights to Out2[0,..,m)
    parlay::parallel_for(0, n, [&] (size_t i) {
        size_t o = offsets[i];
        wghVertex<intV,Weight> v = G[i];
        for (intV j = 0; j < v.degree; j++) {
          Out1[2 + n + o + j] = v.Neighbors[j];
          Out2[o + j] = v.nghWeights[j]; }
      });
    int r = write2SeqToFile(WghAdjGraphHeader, Out1, Out2, fname);
    return r;
  }

  // Writes the edges of EA to fname under the EdgeArray header and returns the
  // result of writeSeqToFile.
  template <class intV>
  int writeEdgeArrayToFile(edgeArray<intV> const &EA, char* fname) {
    auto const &edges = EA.E;
    return writeSeqToFile(EdgeArrayHeader, edges, fname);
  }

  // Writes the edges of a weighted edge array to fname under the
  // WeightedEdgeArray header and returns the result of writeSeqToFile.
  // The second template argument is forwarded as wghEdgeArray's second
  // parameter.
  template <class intV, class Second>
  int writeWghEdgeArrayToFile(wghEdgeArray<intV,Second> const &EA,
                              char* fname) {
    auto const &edges = EA.E;
    return writeSeqToFile(WghEdgeArrayHeader, edges, fname);
  }

  // Reads an EdgeArray file: the header word followed by whitespace-separated
  // "u v" pairs. Aborts if the first word is not the EdgeArray header. Both
  // dimensions of the returned edgeArray are set to one more than the largest
  // endpoint id seen.
  template <class intV>
  edgeArray<intV> readEdgeArrayFromFile(char* fname) {
    parlay::sequence<char> contents = readStringFromFile(fname);
    parlay::sequence<char*> words = stringToWords(contents);
    if (words[0] != EdgeArrayHeader) {
      cout << "Bad input file" << endl;
      abort();
    }

    // every edge takes two words after the header
    const long numEdges = (words.size()-1)/2;
    auto parseEdge = [&] (long i) -> edge<intV> {
      char* uWord = words[2*i + 1];
      char* vWord = words[2*i + 2];
      return edge<intV>(atol(uWord), atol(vWord));
    };
    auto E = parlay::tabulate(numEdges, parseEdge);

    // component-wise maximum over all edges
    auto maxEndpoints = [&] (edge<intV> a, edge<intV> b) {
      return edge<intV>(std::max(a.u, b.u), std::max(a.v, b.v));
    };
    auto largest = parlay::reduce(E, parlay::make_monoid(maxEndpoints,
                                                         edge<intV>(0,0)));

    intV maxrc = std::max(largest.u, largest.v) + 1;
    return edgeArray<intV>(std::move(E), maxrc, maxrc);
  }

  // Reads a WeightedEdgeArray file: the header word followed by
  // whitespace-separated "u v weight" triples. Aborts if the first word is not
  // the WeightedEdgeArray header. The size passed to the returned array is one
  // more than the largest endpoint id seen.
  template <class intV, class Weight>
  wghEdgeArray<intV,Weight> readWghEdgeArrayFromFile(char* fname) {
    using WE = wghEdge<intV,Weight>;
    parlay::sequence<char> contents = readStringFromFile(fname);
    parlay::sequence<char*> words = stringToWords(contents);
    if (words[0] != WghEdgeArrayHeader) {
      cout << "Bad input file" << endl;
      abort();
    }

    // every edge takes three words after the header
    const long numEdges = (words.size()-1)/3;
    auto parseEdge = [&] (size_t i) -> WE {
      char* uWord = words[3*i + 1];
      char* vWord = words[3*i + 2];
      char* wWord = words[3*i + 3];
      return WE(atol(uWord), atol(vWord), (Weight) atof(wWord));
    };
    auto E = parlay::tabulate(numEdges, parseEdge);

    // component-wise maximum of the endpoints; the weight slot is unused
    auto maxEndpoints = [&] (WE a, WE b) {
      return WE(std::max(a.u, b.u), std::max(a.v, b.v), 0);
    };
    auto largest = parlay::reduce(E, parlay::make_monoid(maxEndpoints,
                                                         WE(0,0,0)));

    return wghEdgeArray<intV,Weight>(std::move(E),
                                     max<intV>(largest.u, largest.v) + 1);
  }

  // Reads an AdjacencyGraph file. The token stream is
  //   [header, num_vertices, num_edges, <vertex offsets>, <edges>]
  // i.e. a compressed-sparse-row layout. Aborts when the header is wrong or the
  // token count is not n + m + 3.
  template <class intV, class intE=intV>
  graph<intV, intE> readGraphFromFile(char* fname) {
    auto tokens = get_tokens(fname);
    string header(tokens[0].begin(), tokens[0].end());
    if (header != AdjGraphHeader) {
      cout << "Bad input file: missing header: " << AdjGraphHeader << endl;
      abort();
    }

    long n = parlay::chars_to_long(tokens[1]);
    long m = parlay::chars_to_long(tokens[2]);
    if (tokens.size() != n + m + 3) {
      cout << "Bad input file: length = "<< tokens.size() << " n+m+3 = " << n+m+3 << endl;
      abort();
    }

    // n offsets come from the file; one extra entry equal to m is appended
    auto offsetAt = [&] (size_t i) -> intE {
      return (i == n) ? m : parlay::chars_to_long(tokens[i+3]);
    };
    auto edgeAt = [&] (size_t i) -> intV {
      return parlay::chars_to_long(tokens[n+i+3]);
    };
    auto offsets = parlay::tabulate(n+1, offsetAt);
    auto edges = parlay::tabulate(m, edgeAt);

    return graph<intV, intE>(std::move(offsets), std::move(edges), n);
  }

  // parlay::sequence<char> mmapStringFromFile(const char *filename) {
  //   struct stat sb;
  //   int fd = open(filename, O_RDONLY);
  //   if (fd == -1) {
  //     perror("open");
  //     exit(-1);
  //   }
  //   if (fstat(fd, &sb) == -1) {
  //     perror("fstat");
  //     exit(-1);
  //   }
  //   if (!S_ISREG (sb.st_mode)) {
  //     perror("not a file\n");
  //     exit(-1);
  //   }
  //   char *p = static_cast<char*>(mmap(0, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0));
  //   if (p == MAP_FAILED) {
  //     perror("mmap");
  //     exit(-1);
  //   }
  //   if (close(fd) == -1) {
  //     perror("close");
  //     exit(-1);
  //   }
  //   size_t n = sb.st_size;
  //   return parlay::sequence<char>(p, n); // Yikes!
  // }

  // template <class intV, class intV>
  // graphC<intV, intV> readGraphCFromFile(char* fname, bool mmap=false) {

  //   parlay::sequence<char*> W;
  //   if (mmap) {
  //     cout << "mmapping file" << endl;
  //     parlay::sequence<char> S = mmapStringFromFile(fname);
  //     // copy to new sequence
  //     parlay::sequence<char> bytes = S;
  //     // and unmap
  //     if (munmap(S.begin(), S.size()) == -1) {
  //       perror("munmap");
  //       exit(-1);
  //     }
  //     W = stringToWords(S);
  //     cout << "mmap'd" << endl;
  //   } else {
  //     auto S = readStringFromFile(fname);
  //     W = stringToWords(S);
  //   }

  //   if (W[0] != AdjGraphHeader) {
  //     cout << "Bad input file: missing header: " << AdjGraphHeader << endl;
  //     abort();
  //   }

  //   // num vertices, num edges, edge offsets, edge pointers
  //   long len = W.size() -1;
  //   long n = atol(W[1]);
  //   long m = atol(W[2]);
  //   if (len != n + m + 2) {
  //     cout << "Bad input file: length = "<<len<< " n+m+2 = " << n+m+2 << endl;
  //     abort();
  //   }
  //   sequence<intV> offsets(n+1, [&] (size_t i) {
  // 	return (i == n) ? m : atol(W[i+3]);});
  //   sequence<intV> edges(m, [&] (size_t i) {
  // 	return atol(W[n+i+3]);});

  //   return graphC<intV,intV>(offsets,edges,n,m);
  // }

  // Reads a WeightedAdjacencyGraph file. After the header come n, m, then n
  // vertex offsets, m edge targets and m edge weights. Aborts when the header
  // is wrong or the word count is not n + 2*m + 3.
  template <class intV, class Weight, class intE>
  wghGraph<intV, Weight, intE> readWghGraphFromFile(char* fname) {
    parlay::sequence<char> contents = readStringFromFile(fname);
    parlay::sequence<char*> words = stringToWords(contents);
    if (words[0] != WghAdjGraphHeader) {
      cout << "Bad input file" << endl;
      abort();
    }

    long n = atol(words[1]);
    long m = atol(words[2]);
    if (words.size() != n + 2*m + 3) {
      cout << "Bad input file: length = "<< words.size()
           << " n + 2*m + 3 = " << n+2*m+3 << endl;
      abort();
    }

    // n offsets come from the file; one extra entry equal to m is appended
    auto offsetAt = [&] (size_t i) -> intE {
      return (i == n) ? m : atol(words[i+3]);
    };
    auto edgeAt = [&] (size_t i) -> intV {
      return atol(words[n+i+3]);
    };
    // weights follow the m edge targets
    auto weightAt = [&] (size_t i) -> Weight {
      return (Weight) atof(words[n+i+3+m]);
    };
    auto offsets = parlay::tabulate(n+1, offsetAt);
    auto edges = parlay::tabulate(m, edgeAt);
    auto weights = parlay::tabulate(m, weightAt);

    return wghGraph<intV,Weight,intE>(std::move(offsets),
                                      std::move(edges),
                                      std::move(weights), n);
  }

  // The following two are used by the graph generators to write out in either format
  // and either with reordering or not
  // Writes G to fname.
  //   adjArray : true -> adjacency-graph format, false -> edge-array format
  //   ordered  : true -> write G as is; false -> pass G through graphReorder
  //              first (and, for the edge-array format, shuffle the edges)
  template <class intV, class intE>
  void writeGraphFromAdj(graph<intV,intE> const &G,
                         char* fname, bool adjArray, bool ordered) {
    if (adjArray) {
      if (ordered) {
        writeGraphToFile(G, fname);
      } else {
        writeGraphToFile(graphReorder(G), fname);
      }
      return;
    }

    if (ordered) {
      writeEdgeArrayToFile(edgesFromGraph(G), fname);
      return;
    }

    auto B = edgesFromGraph(graphReorder(G));
    B = randomShuffle(B);
    writeEdgeArrayToFile(B, fname);
  }

  // Converts an edge array to a graph and writes it with writeGraphFromAdj.
  // Note that adjArray is also what is passed as graphFromEdges' second
  // argument.
  template <class intV, class intE=intV>
  void writeGraphFromEdges(edgeArray<intV> &EA, char* fname, bool adjArray, bool ordered) {
    auto G = graphFromEdges<intV,intE>(EA, adjArray);
    writeGraphFromAdj(G, fname, adjArray, ordered);
  }

  // void errorOut(const char* s) {
  //   cerr << s << endl;
  //   throw s;
  // }

  // void packInt64(int64_t x, uint8_t buf[8]) {
  //   uint64_t xu = x;
  //   for (int i = 0; i < 8; ++i)
  //     buf[i] = (xu >> (8 * i)) & 0xff;
  // }
  // int64_t unpackInt64(const uint8_t buf[8]) {
  //   uint64_t xu = 0;
  //   for (int i = 0; i < 8; ++i)
  //     xu |= ((uint64_t)buf[i]) << (i * 8);
  //   return (int64_t)xu;
  // }

  // void writeInt(ostream& out, char buf[8], int64_t x) {
  //   packInt64(x, (uint8_t*)buf);
  //   out.write(buf, 8);
  // }
  // int64_t readInt(istream& in, char buf[8]) {
  //   in.read(buf, 8);
  //   return unpackInt64((uint8_t*)buf);
  // }

  // template<typename intV>
  // void writeFlowGraph(ostream& out, FlowGraph<intV> g) {
  //   char buf[8];
  //   out.write("FLOWFLOW", 8);
  //   writeInt(out, buf, g.g.n);
  //   writeInt(out, buf, g.g.m);
  //   writeInt(out, buf, g.source);
  //   writeInt(out, buf, g.sink);
  //   intV o = 0;
  //   for (intV i = 0; i < g.g.n; ++i) {
  //     writeInt(out, buf, o);
  //     o += g.g.V[i].degree;
  //   }
  //   for (intV i = 0; i < g.g.n; ++i) {
  //     wghVertex<intV>& v = g.g.V[i];
  //     for (intV j = 0; j < v.degree; ++j) {
  //       writeInt(out, buf, v.Neighbors[j]);
  //       writeInt(out, buf, v.nghWeights[j]);
  //     }
  //   }
  // }
  // template<typename intV>
  // FlowGraph<intV> readFlowGraph(istream& in) {
  //   char buf[10];
  //   in.read(buf, 8);
  //   buf[8] = 0;
  //   if (strcmp(buf, "FLOWFLOW"))
  //     errorOut("Invalid flow graph input file");
  //   intV n = readInt(in, buf);
  //   intV m = readInt(in, buf);
  //   intV S = readInt(in, buf);
  //   intV T = readInt(in, buf);
  //   intV *offset = newA(intV, n);
  //   intV* adj = newA(intV, m);
  //   intV* weights = newA(intV, m);
  //   wghVertex<intV>* v = newA(wghVertex<intV>, n);
  //   for (intV i = 0; i < n; ++i) {
  //     offset[i] = readInt(in, buf);
  //     v[i].Neighbors = adj + offset[i];
  //     v[i].nghWeights = weights + offset[i];
  //     if (i > 0)
  //       v[i - 1].degree = offset[i] - offset[i - 1];
  //   }
  //   v[n - 1].degree = m - offset[n - 1];
  //   free(offset);
  //   for (intV i = 0; i < m; ++i) {
  //     adj[i] = readInt(in, buf);
  //     weights[i] = readInt(in, buf);
  //   }
  //   return FlowGraph<intV>(wghGraph<intV>(v, n, m, adj, weights), S, T);
  // }

  // const char nl = '\n';
  // template <typename intV>
  // FlowGraph<intV> writeFlowGraphDimacs(ostream& out, FlowGraph<intV> g) {
  //   out << "c DIMACS flow network description" << nl;
  //   out << "c (problem-id, nodes, arcs)" << nl;
  //   out << "p max " << g.g.n << " " << g.g.m << nl;

  //   out << "c source" << nl;
  //   out << "n " << g.source + 1 << " s" << nl;
  //   out << "c sink" << nl;
  //   out << "n " << g.sink + 1 << " t" << nl;

  //   out << "c arc description (from, to, capacity)" << nl;

  //   for (intV i = 0; i < g.g.n; ++i) {
  //     wghVertex<intV>& v = g.g.V[i];
  //     for (intV j = 0; j < v.degree; ++j) {
  //       out << "a " << i + 1 << " " << v.Neighbors[j] + 1 << " "
  //           << v.nghWeights[j] << nl;
  //     }
  //   }
  // }

  // template<typename intV>
  // struct intWghEdge {
  //   intV from, to, w;
  // };
  // int readDimacsLinePref(istream& in, const char* expected) {
  //   char type;
  //   while (in >> type) {
  //     if (type == 'c') {
  //       while (in.peek() != EOF && in.peek() != '\n')
  //         in.ignore();
  //       in >> ws;
  //       continue;
  //     } else if (!strchr(expected, type)) {
  //       errorOut((string("Unexpected DIMACS line (expected 'c' or one of '")
  // 		  + expected + "')").c_str());
  //     }
  //     return type;
  //   }
  //   return EOF;
  // }

  // template <typename intV>
  // FlowGraph<intV> readFlowGraphDimacs(istream& in) {
  //   string tmp;
  //   intV n, m;
  //   int type = readDimacsLinePref(in, "p");
  //   if (type == EOF)
  //     errorOut("Unexpected EOF while reading DIMACS file");
  //   in >> tmp >> n >> m;
  //   intWghEdge<intV>* edges = newA(intWghEdge<intV>, m);
  //   intV edgei = 0;
  //   intV* pos = newA(intV, n + 1);
  //   intV S = -1, T = -1;
  //   while (EOF != (type = readDimacsLinePref(in, "an"))) {
  //     if (type == 'n') {
  //       intV x;
  //       char st;
  //       in >> x >> st;
  //       x--;
  //       if (st == 's') S = x;
  //       else T = x;
  //     } else { // type == 'a'
  //       intV from, to, cap;
  //       in >> from >> to >> cap;
  //       from--; to--;
  //       edges[edgei] = (intWghEdge<intV>) { from, to, cap };
  //       edgei++;
  //       pos[from + 1]++;
  //     }
  //   }
  //   if (S < 0)
  //     errorOut("No source was specified in DIMACS input file");
  //   if (T < 0)
  //     errorOut("No sink was specified in DIMACS input file");
  //   if (m != edgei)
  //     errorOut("Inconsistent edge count in DIMACS input file");
  //   intV* adj = newA(intV, m);
  //   intV* weights = newA(intV, m);
  //   wghVertex<intV>* v = newA(wghVertex<intV>, n);
  //   for (intV i = 0; i < n; ++i) {
  //     pos[i + 1] += pos[i];
  //     v[i].Neighbors = adj + pos[i];
  //     v[i].nghWeights = weights + pos[i];
  //     v[i].degree = pos[i + 1] - pos[i];
  //   }
  //   for (intV i = 0; i < m; ++i) {
  //     intV& p = pos[edges[i].from];
  //     adj[p] = edges[i].to;
  //     weights[p] = edges[i].w;
  //     p++;
  //   }
  //   free(edges);
  //   free(pos);
  //   return FlowGraph<intV>(wghGraph<intV>(v, n, m, adj, weights), S, T);
  // }
};


================================================
FILE: algorithms/bench/common/graphUtils.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <iostream>
#include <fstream>
#include <cstdlib>
#include <math.h>
#include "graph.h"
#include "../parlay/parallel.h"
#include "../parlay/primitives.h"
#include "../parlay/random.h"
#include "dataGen.h"

using namespace std;

// Attaches a weight to every edge of G. The weight of the i-th edge is
// dataGen::hash<Weight>(i), so it depends only on the edge's position.
// Returns a weighted edge array with the same endpoints, sized by G.numRows.
template <class intV, class Weight = DefaultWeight>
wghEdgeArray<intV,Weight> addRandWeights(edgeArray<intV> const &G) {
  using WE = wghEdge<intV,Weight>;
  // Keep the edge count as size_t: it is only used as a sequence length, and
  // narrowing it to intV first could truncate it.
  size_t m = G.nonZeros;
  intV n = G.numRows;
  auto E = parlay::tabulate(m, [&] (size_t i) -> WE {
      return WE(G.E[i].u, G.E[i].v, (Weight) dataGen::hash<Weight>(i));});
  return wghEdgeArray<intV,Weight>(std::move(E), n);
}

// Returns a copy of A whose edges have been passed through
// parlay::random_shuffle; the dimensions are carried over unchanged.
template <class intV>
edgeArray<intV> randomShuffle(edgeArray<intV> const &A) {
  return edgeArray<intV>(parlay::random_shuffle(A.E), A.numRows, A.numCols);
}

// Returns a copy of A with its edges passed through
// parlay::remove_duplicates_ordered, using lexicographic (u, then v) order.
// The dimensions are carried over unchanged.
template <class intV>
edgeArray<intV> remDuplicates(edgeArray<intV> const &A) {
  auto edgeLess = [&] (edge<intV> a, edge<intV> b) {
    if (a.u != b.u) return a.u < b.u;
    return a.v < b.v;
  };
  parlay::sequence<edge<intV>> unique =
    parlay::remove_duplicates_ordered(A.E, edgeLess);
  return edgeArray<intV>(std::move(unique), A.numRows, A.numCols);
}

// Builds a symmetric edge array from A: self loops are dropped, every
// remaining edge is joined by its reverse, and the result goes through
// remDuplicates.
template <class intV>
edgeArray<intV> makeSymmetric(edgeArray<intV> const &A) {
  auto notSelfLoop = [&] (edge<intV> e) { return e.u != e.v; };
  parlay::sequence<edge<intV>> forward = parlay::filter(A.E, notSelfLoop);

  // lazily generated reversed copy of every kept edge
  auto reversed = parlay::delayed_seq<edge<intV>>(forward.size(), [&] (size_t i) {
      return edge<intV>(forward[i].v, forward[i].u);});

  edgeArray<intV> both(parlay::append(forward, reversed),
                       A.numRows, A.numCols);
  return remDuplicates(both);
}

// Converts an edge array into a graph in offsets/edges form.
//   EA      : input edges
//   makeSym : when true the edges are first passed through makeSymmetric
// The vertex count is the larger of the array's row and column counts.
template <class intV, class intE = intV>
graph<intV,intE> graphFromEdges(edgeArray<intV> const &EA, bool makeSym) {
  // SA is only filled in when symmetrisation was requested; A refers to
  // whichever edge array is actually used below.
  edgeArray<intV> SA;
  if (makeSym) SA = makeSymmetric<intV>(EA);
  edgeArray<intV> const &A = (makeSym) ? SA : EA;

  size_t m = A.nonZeros;
  size_t n = std::max(A.numCols, A.numRows);

  parlay::sequence<size_t> counts;
  parlay::sequence<intE> offsets;
  parlay::sequence<edge<intV>> E;
  size_t nn;
  // Sort the edges with the source endpoint u as key.
  // presumably counts[i] is the number of edges whose key is i -- TODO confirm
  auto getu = [&] (edge<intV> e) {return e.u;};
  std::tie(E, counts) = parlay::internal::integer_sort_with_counts(parlay::make_slice(A.E), getu, n);
  // Scan the counts (with one extra trailing 0, so n+1 entries) to obtain the
  // offsets; nn receives the scan's second result and is not used afterwards.
  std::tie(offsets,nn) = parlay::scan(parlay::delayed_seq<intE>(n+1, [&] (size_t i) {
	return (i == n) ? 0 : counts[i];}), parlay::addm<intE>());

  // The edge array of the graph holds the target endpoint of every sorted edge.
  return graph<intV,intE>(std::move(offsets),
			  parlay::tabulate(m, [&] (size_t i) -> intV {return E[i].v;}),
			  n);
}

// Converts a weighted edge array into a weighted graph in
// offsets/edges/weights form, using A.n vertices and A.m edges.
template <class intV, class Weight, class intE=intV>
wghGraph<intV,Weight,intE>
wghGraphFromEdges(wghEdgeArray<intV,Weight> const &A) {
  using WE = wghEdge<intV,Weight>;
  size_t n = A.n;
  size_t m = A.m;

  parlay::sequence<size_t> counts;
  parlay::sequence<intE> offsets;
  parlay::sequence<WE> E;
  size_t nn;
  // Sort the edges with the source endpoint u as key.
  // presumably counts[i] is the number of edges whose key is i -- TODO confirm
  auto getu = [&] (WE e) {return e.u;};
  std::tie(E, counts) = parlay::internal::integer_sort_with_counts(parlay::make_slice(A.E), getu, n);
  // Scan the counts (with one extra trailing 0, so n+1 entries) to obtain the
  // offsets; nn receives the scan's second result and is not used afterwards.
  std::tie(offsets,nn) = parlay::scan(parlay::delayed_seq<intE>(n+1, [&] (size_t i) {
	return (i == n) ? 0 : counts[i];}), parlay::addm<intE>());

  // Targets and weights are split out of the sorted edges into two parallel
  // arrays that share the same indexing.
  return wghGraph<intV,Weight,intE>(std::move(offsets),
				    parlay::tabulate(m, [&] (size_t i)->intV {return E[i].v;}),
				    parlay::tabulate(m, [&] (size_t i) -> Weight {
					return E[i].weight;}),
				    n);
}

// Flattens a graph into an edge array: for each vertex j, one edge
// (j, neighbour) per entry of its adjacency list, stored at the positions
// given by the graph's offsets. Both dimensions of the result are the vertex
// count.
template <class intV, class intE>
edgeArray<intV> edgesFromGraph(graph<intV,intE> const &G) {
  const size_t numVerts = G.numVertices();
  const size_t numEdges = G.numEdges();

  parlay::sequence<edge<intV>> flat(numEdges);
  parlay::parallel_for(0, numVerts, [&] (size_t src) {
      size_t base = G.get_offsets()[src];
      vertex<intV> vtx = G[src];
      size_t k = 0;
      while (k < vtx.degree) {
        flat[base + k] = edge<intV>(src, vtx.Neighbors[k]);
        k++;
      }
    });
  return edgeArray<intV>(std::move(flat), numVerts, numVerts);
}

// Offset of the start of each vertex's edges if the edge lists were flattened:
// a scan over the vertex degrees, with one extra trailing zero so the input
// has n+1 entries.
template <class intV, class intE, class Vtx>
parlay::sequence<intE> getOffsets(parlay::sequence<Vtx> const &V) {
  const size_t numVerts = V.size();
  auto degreeAt = [&] (size_t i) -> intE {
    return (i == numVerts) ? 0 : V[i].degree;
  };
  auto degrees = parlay::delayed_seq<intE>(numVerts + 1, degreeAt);
  auto scanned = parlay::scan(degrees, parlay::addm<intE>());
  return scanned.first;
}

// Packs a graph so that its edge array has no gaps (i.e. into CSR): new
// offsets are computed by scanning the per-vertex degrees and every adjacency
// list is copied to its new contiguous position.
template <class intV, class intE>
graph<intV,intE> packGraph(graph<intV,intE> const &G) {
  size_t n = G.numVertices();
  auto degreeAt = [&] (size_t i) -> intE {
    return (i == n) ? 0 : G[i].degree;
  };
  auto degrees = parlay::delayed_seq<intE>(n+1, degreeAt);

  // first = new offsets, second = total number of edges
  auto scanned = parlay::scan(degrees, parlay::addm<intE>());
  parlay::sequence<intV> packedEdges(scanned.second);

  parlay::parallel_for (0, G.n, [&] (size_t i) {
      vertex<intV> vtx = G[i];
      size_t start = scanned.first[i];
      size_t k = 0;
      while (k < vtx.degree) {
        packedEdges[start + k] = vtx.Neighbors[k];
        k++;
      }
    });
  return graph<intV,intE>(std::move(scanned.first), std::move(packedEdges), n);
}

// Relabels the vertices of Gr: vertex i becomes vertex I[i], neighbour ids are
// mapped through the same table, and every adjacency list of the result is
// sorted. If I is empty (the default) a random permutation is used instead.
template <class intV, class intE>
graph<intV,intE> graphReorder(graph<intV,intE> const &Gr,
			      parlay::sequence<intV> const &I = parlay::sequence<intV>(0)) {
  intV n = Gr.numVertices();
  intV m = Gr.numEdges();

  bool noI = (I.size()==0);
  // The conditional yields a temporary in both branches; binding it to a const
  // reference keeps it alive for the rest of the function.
  parlay::sequence<intV> const &II = noI ? parlay::random_permutation<intV>(n) : I;

  // now write vertices to new locations
  // inverse permutation
  parlay::sequence<vertex<intV>> V(n);
  parlay::parallel_for (0, n, [&] (size_t i) {
      V[II[i]] = Gr[i];});
  parlay::sequence<intE> offsets = getOffsets<intV,intE>(V);
  parlay::sequence<intV> E(m);
  parlay::parallel_for (0, n, [&] (size_t i) {
      size_t o = offsets[i];
      for (size_t j=0; j < V[i].degree; j++)
	E[o + j] = II[V[i].Neighbors[j]];
      std::sort(E.begin() + o, E.begin() + o + V[i].degree);
    }, 1000);
  // Construct the declared return type. This previously built graph<intV>,
  // dropping intE, which does not match whenever intE differs from intV.
  return graph<intV,intE>(std::move(offsets), std::move(E), n);
}

// Sanity-checks a graph. Returns 1 (after printing a message) when the sum of
// the vertex degrees differs from the reported edge count or when some
// neighbour id is >= the number of vertices; returns 0 when both checks pass.
template <class intV, class intE>
int graphCheckConsistency(graph<intV,intE> const &Gr) {
  size_t n = Gr.numVertices();
  size_t m = Gr.numEdges();
  size_t edgecount = parlay::reduce(parlay::delayed_seq<size_t>(n, [&] (size_t i) {
	return Gr[i].degree;}), parlay::addm<size_t>());
  if (m != edgecount) {
    cout << "bad edge count in graphCheckConsistency: m = " 
	 << m << " sum of degrees = " << edgecount << endl;
    return 1;
  }
  // Each vertex maps to its own index if it has an out-of-range neighbour and
  // to n otherwise; the minimum is the first offending vertex, or n if none.
  size_t error_loc = parlay::reduce(parlay::delayed_seq<size_t>(n, [&] (size_t i) {
	for (size_t j=0; j < Gr[i].degree; j++) 
	  if (Gr[i].Neighbors[j] >= n) return i;
	return n;
      }), parlay::minm<size_t>());
  if (error_loc < n) {
    cout << "edge out of range in graphCheckConsistency: at i = " 
	 << error_loc << endl;
    return 1;
  }
  // Success path: previously control fell off the end of this non-void
  // function, which is undefined behaviour.
  return 0;
}

// template <class intV>
// sparseRowMajor<double,intV> sparseFromCsrFile(const char* fname) {
//   FILE *f = fopen(fname,"r");
//   if (f == NULL) {
//     cout << "Trying to open nonexistant file: " << fname << endl;
//     abort();
//   }

//   intV numRows;  intV numCols;  intV nonZeros;
//   intV nc = fread(&numRows, sizeof(intV), 1, f);
//   nc = fread(&numCols, sizeof(intV), 1, f);
//   nc = fread(&nonZeros, sizeof(intV), 1, f); 

//   double *Values = (double *) malloc(sizeof(double)*nonZeros);
//   intV *ColIds = (intV *) malloc(sizeof(intV)*nonZeros);
//   intV *Starts = (intV *) malloc(sizeof(intV)*(1 + numRows));
//   Starts[numRows] = nonZeros;

//   size_t r;
//   r = fread(Values, sizeof(double), nonZeros, f);
//   r = fread(ColIds, sizeof(intV), nonZeros, f);
//   r = fread(Starts, sizeof(intV), numRows, f); 
//   fclose(f);
//   return sparseRowMajor<double,intV>(numRows,numCols,nonZeros,Starts,ColIds,Values);
// }

// template <class intV>
// edgeArray<intV> edgesFromMtxFile(const char* fname) {
//   ifstream file (fname, ios::in);
//   char* line = newA(char,1000);
//   intV i,j = 0;
//   while (file.peek() == '%') {
//     j++;
//     file.getline(line,1000);
//   }
//   intV numRows, numCols, nonZeros;
//   file >> numRows >> numCols >> nonZeros;
//   //cout << j << "," << numRows << "," << numCols << "," << nonZeros << endl;
//   edge<intV> *E = newA(edge<intV>,nonZeros);
//   double toss;
//   for (i=0, j=0; i < nonZeros; i++) {
//     file >> E[j].u >> E[j].v >> toss;
//     E[j].u--;
//     E[j].v--;
//     if (toss != 0.0) j++;
//   }
//   nonZeros = j;
//   //cout << "nonzeros = " << nonZeros << endl;
//   file.close();  
//   return edgeArray<intV>(E,numRows,numCols,nonZeros);
// }


================================================
FILE: algorithms/bench/common/ligraLight.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <limits>
#include "parlay/primitives.h"
#include "parlay/parallel.h"
#include "parlay/internal/get_time.h"
#include "parlay/internal/block_delayed.h"
#include "common/graph.h"

namespace delayed = parlay::block_delayed;

namespace ligra {
  
// A set of vertices held in one of two representations:
//   sparse : a sequence of vertex ids (is_sparse == true)
//   dense  : a sequence of bools, one per index (is_sparse == false)
// n is the number of members: the sequence length for sparse input, the
// number of true entries for dense input.
template<typename vertexId>
struct vertex_subset {
  using sparse_t = parlay::sequence<vertexId>;
  using dense_t = parlay::sequence<bool>;
  bool is_sparse;
  size_t n;
  size_t size() const {return n;}
  sparse_t sparse;
  dense_t dense;

  // Members are initialised in declaration order (is_sparse, n, sparse,
  // dense) regardless of how the list is written. The lists below follow that
  // order so it is visible that n is computed from x before x is moved from.

  // Sparse subset from a sequence of vertex ids.
  vertex_subset(sparse_t x) :
    is_sparse(true), n(x.size()), sparse(std::move(x)) {}
  // Sparse subset holding the single vertex v.
  vertex_subset(vertexId v) :
    is_sparse(true), n(1), sparse(sparse_t(1,v)) {}
  // Dense subset from a sequence of membership flags.
  vertex_subset(dense_t x) :
    is_sparse(false), n(parlay::count(x,true)), dense(std::move(x)) {}
};

// Function object that maps an edge function over the edges touching a vertex
// subset and returns the subset of vertices for which it succeeded.
//   fa(u, v) : edge function; v joins the output when it returns true
//   cond(v)  : v is only considered while this returns true
//   dedup    : in the sparse path, keep a single copy of each output vertex
//   verbose  : print which path runs (and is forwarded to the timer)
// The graph is held by reference and must outlive the edge_map.
template<typename Graph, typename Fa, typename Cond> 
struct edge_map {
  using vertexId = typename Graph::vertexId;
  using vertex_subset_ = vertex_subset<vertexId>;
  using vertex_subset_sparse = parlay::sequence<vertexId>;
  using vertex_subset_dense = parlay::sequence<bool>;
  Fa fa;
  Cond cond;
  const Graph& G;
  bool dedup;
  bool verbose;
  parlay::sequence<vertexId> dup_seq;  // scratch space used only for dedup

  // The initializer list follows the member declaration order, which is the
  // order in which the members are actually initialised.
  edge_map(Graph const &G, Fa fa, Cond cond, bool dedup=false,
           bool verbose=false) :
    fa(fa), cond(cond), G(G), dedup(dedup), verbose(verbose) {
    dup_seq = parlay::sequence<vertexId>::uninitialized(G.numVertices());
  }

  // Sparse path: enumerate every (v, neighbour) pair for v in the subset, keep
  // the neighbours that satisfy cond and for which fa(v, neighbour) is true.
  auto edge_map_sparse(vertex_subset_sparse const &vtx_subset) {
    if (verbose) std::cout << "edge map sparse: " << vtx_subset.size() << std::endl;
    auto nested_edges = parlay::map(vtx_subset, [&] (vertexId v) {
        return parlay::delayed_tabulate(G[v].degree, [&, v] (size_t i) {
            return std::pair(v, G[v].Neighbors[i]);});});
    auto edges = delayed::flatten(nested_edges);
    auto r = delayed::filter_map(edges,
                                 [&] (auto x) {return cond(x.second) && fa(x.first, x.second);},
                                 [] (auto x)  {return x.second;});
    if (dedup) {
      // Every position writes its index into the slot of its vertex; a
      // position is kept only if its own index is the one that ended up there.
      parlay::parallel_for(0,r.size(), [&] (size_t i) { dup_seq[r[i]] = i;});
      auto flags = parlay::tabulate(r.size(), [&] (size_t i) {return i==dup_seq[r[i]];});
      return vertex_subset_(parlay::pack(r, flags));
    }
    return vertex_subset_(std::move(r));
  }

  // Dense path: for every vertex v with cond(v), scan its neighbour list and
  // call fa(u, v) for each neighbour u that is in the subset. Long lists are
  // processed in blocks of block_size, in parallel when there is more than one
  // block; a block stops as soon as cond(v) no longer holds.
  auto edge_map_dense(vertex_subset_dense const &vtx_subset) {
    if (verbose) std::cout << "edge map dense:  " << vtx_subset.size() << std::endl;
    auto r = parlay::tabulate(G.numVertices(), [&] (vertexId v) -> bool {
        bool result = false;
        if (cond(v)) {
          size_t block_size = 5000;
          auto vtx = G[v];
          auto d = vtx.degree;
          auto ngh = vtx.Neighbors;
          // NOTE(review): when several blocks run in parallel they all write
          // `result` without synchronisation -- confirm this is intended.
          auto do_block = [&, vsub=vtx_subset.begin()] (size_t i) {
            size_t begin = block_size * i;
            size_t end = std::min<size_t>(begin + block_size, d);
            for (size_t j = begin; j < end; j++) {
              if (!cond(v)) return;
              vertexId u = ngh[j];
              if (vsub[u]) {
                bool x = fa(u,v);
                if (!result && x) result = true;
              }}};
          size_t num_blocks = vtx.degree/block_size + 1;
          if (num_blocks == 1) do_block(0);
          else parlay::parallel_for(0, num_blocks, do_block, 1);
        }
        return result;});
    return vertex_subset_(std::move(r));
  }

  // Applies the edge map to vtx_subset, choosing a path by size:
  //  - sparse input goes dense when (subset size + total degree of its
  //    members) exceeds G.m/20, otherwise it stays sparse;
  //  - dense input stays dense when the subset size exceeds n/20, otherwise it
  //    is converted to a sparse index list.
  auto operator() (vertex_subset_ const &vtx_subset) {
    parlay::internal::timer t("edge_map", verbose);
    auto l = vtx_subset.size();
    auto n = G.numVertices();
    if (vtx_subset.is_sparse) {
      auto out_degree = parlay::reduce(parlay::delayed_map(vtx_subset.sparse, [&] (size_t i) {
                           return G[i].degree;}));
      if ((l + out_degree) > G.m/20) {
        parlay::sequence<bool> d_vtx_subset(n, false);
        parlay::parallel_for(0, l, [&] (size_t i) {
          d_vtx_subset[vtx_subset.sparse[i]] = true;});
        t.next("convert");
        return edge_map_dense(d_vtx_subset);
      } else return edge_map_sparse(vtx_subset.sparse);
    } else {
      if (l > n/20) return edge_map_dense(vtx_subset.dense);
      else {
        auto s_vtx_subset = parlay::pack_index<vertexId>(vtx_subset.dense);
        return edge_map_sparse(s_vtx_subset);
      }
    }
  }
};
}


================================================
FILE: algorithms/bench/common/parallelDefs
================================================
# Compiler and linker settings shared by the benchmark makefiles.

# Link against jemalloc only when jemalloc-config is found by `which`.
ifeq (, $(shell which jemalloc-config))
JEMALLOC =
else
JEMALLOCLD = $(shell jemalloc-config --libdir)
JEMALLOC = -L$(JEMALLOCLD) -ljemalloc 
endif

# Flags common to every scheduler choice: CCFLAGS for compiling, CLFLAGS for
# linking.
CCFLAGS = -mcx16 -O3 -std=c++17 -DNDEBUG -I .
CLFLAGS = -ldl $(JEMALLOC)

# Scheduler-specific flags.
OMPFLAGS = -DPARLAY_OPENMP -fopenmp
CILKFLAGS = -DPARLAY_CILK -fcilkplus
PBBFLAGS = -DHOMEGROWN -pthread

# Select the scheduler: OPENMP if that make variable is defined, otherwise
# CILK if defined, otherwise the PBBFLAGS default.
ifdef OPENMP
CC = g++
CFLAGS = $(OMPFLAGS) $(CCFLAGS)
LFLAGS = $(OMPFLAGS) $(CLFLAGS)

else ifdef CILK
CC = g++
CFLAGS = $(CILKFLAGS) $(CCFLAGS)
LFLAGS = $(CILKFLAGS) $(CLFLAGS)

else
CC = g++
CFLAGS = $(PBBFLAGS) $(CCFLAGS)
LFLAGS = $(PBBFLAGS) $(CLFLAGS)
endif


================================================
FILE: algorithms/bench/common/parallelDefsANN
================================================
# Compiler and linker settings for the ANN benchmarks; CCFLAGS additionally
# passes -march=native.

# Link against jemalloc only when jemalloc-config is found by `which`.
ifeq (, $(shell which jemalloc-config))
JEMALLOC =
else
JEMALLOCLD = $(shell jemalloc-config --libdir)
JEMALLOC = -L$(JEMALLOCLD) -ljemalloc 
endif

# Flags common to every scheduler choice: CCFLAGS for compiling, CLFLAGS for
# linking.
CCFLAGS = -mcx16 -O3 -std=c++17 -march=native -DNDEBUG -I .
CLFLAGS = -ldl $(JEMALLOC)

# Scheduler-specific flags.
OMPFLAGS = -DPARLAY_OPENMP -fopenmp
CILKFLAGS = -DPARLAY_CILK -fcilkplus
PBBFLAGS = -DHOMEGROWN -pthread

# Select the scheduler: OPENMP if that make variable is defined, otherwise
# CILK if defined, otherwise the PBBFLAGS default.
ifdef OPENMP
CC = g++
CFLAGS = $(OMPFLAGS) $(CCFLAGS)
LFLAGS = $(OMPFLAGS) $(CLFLAGS)

else ifdef CILK
CC = g++
CFLAGS = $(CILKFLAGS) $(CCFLAGS)
LFLAGS = $(CILKFLAGS) $(CLFLAGS)

else
CC = g++
CFLAGS = $(PBBFLAGS) $(CCFLAGS)
LFLAGS = $(PBBFLAGS) $(CLFLAGS)
endif


================================================
FILE: algorithms/bench/common/parallelDefs_OMP
================================================
# Compiler and linker settings for the variant whose default (PBBFLAGS) branch
# builds with -DPARLAY_OPENMP -fopenmp.

# Link against jemalloc only when jemalloc-config is found by `which`.
ifeq (, $(shell which jemalloc-config))
JEMALLOC =
else
JEMALLOCLD = $(shell jemalloc-config --libdir)
JEMALLOC = -L$(JEMALLOCLD) -ljemalloc 
endif

# Flags common to every scheduler choice: CCFLAGS for compiling, CLFLAGS for
# linking.
CCFLAGS = -mcx16 -O3 -std=c++17
CLFLAGS = -ldl $(JEMALLOC)

# Scheduler-specific flags.
OMPFLAGS = -DOPENMP -fopenmp
CILKFLAGS = -DCILK -fcilkplus
PBBFLAGS = -DPARLAY_OPENMP -fopenmp -pthread

# Select the scheduler: OPENMP if that make variable is defined, otherwise
# CILK if defined, otherwise the PBBFLAGS default.
ifdef OPENMP
CC = g++
CFLAGS = $(OMPFLAGS) $(CCFLAGS)
# Uses CLFLAGS like the other branches; this previously referenced the
# undefined LCFLAGS, which expands to nothing and dropped -ldl and jemalloc.
LFLAGS = $(OMPFLAGS) $(CLFLAGS)

else ifdef CILK
CC = g++
CFLAGS = $(CILKFLAGS) $(CCFLAGS)
LFLAGS = $(CILKFLAGS) $(CLFLAGS)

else
CC = g++
CFLAGS = $(PBBFLAGS) $(CCFLAGS)
LFLAGS = $(PBBFLAGS) $(CLFLAGS)
endif


================================================
FILE: algorithms/bench/common/parseCommandLine.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#ifndef _PARSE_COMMAND_LINE
#define _PARSE_COMMAND_LINE

#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
using namespace std;

// Lightweight accessor for a program's command-line arguments.
//
// Positional arguments are addressed from the END of argv, and options are
// located by exact string comparison against argv[1..].  Missing or
// malformed arguments print the usage line and abort the process.
struct commandLine {
  int argc;
  char** argv;
  std::string comLine;  // usage text printed after the program name

  commandLine(int _c, char** _v, std::string _cl)
    : argc(_c), argv(_v), comLine(_cl) {}

  commandLine(int _c, char** _v)
    : argc(_c), argv(_v), comLine("bad arguments") {}

  // Prints "usage: <argv[0]> <comLine>" on stdout and aborts.
  void badArgument() {
    std::cout << "usage: " << argv[0] << " " << comLine << std::endl;
    std::abort();
  }

  // get an argument
  // i is indexed from the last argument = 0, second to last indexed 1, ..
  char* getArgument(int i) {
    if (argc < 2+i) badArgument();
    return argv[argc-1-i];
  }

  // looks for two filenames: the last two entries of argv
  std::pair<char*,char*> IOFileNames() {
    if (argc < 3) badArgument();
    return std::pair<char*,char*>(argv[argc-2], argv[argc-1]);
  }

  // Second-to-last argument parsed with atoi, last argument returned as-is.
  std::pair<int,char*> sizeAndFileName() {
    if (argc < 3) badArgument();
    return std::pair<int,char*>(std::atoi(argv[argc-2]), argv[argc-1]);
  }

  // True when `option` appears anywhere in argv[1..argc-1].
  bool getOption(std::string option) {
    for (int i = 1; i < argc; i++)
      if (option == argv[i]) return true;
    return false;
  }

  // Returns the argument following `option`, or a null pointer when the
  // option is absent.  The final argument is never treated as an option
  // because nothing follows it.
  char* getOptionValue(std::string option) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) return argv[i+1];
    return nullptr;
  }

  // Same lookup as above, returning `defaultValue` when the option is absent.
  std::string getOptionValue(std::string option, std::string defaultValue) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) return std::string(argv[i+1]);
    return defaultValue;
  }

  // Integer option.  Values below 1 are rejected; that includes text atoi
  // cannot parse, since atoi yields 0 for it.
  int getOptionIntValue(std::string option, int defaultValue) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) {
        int r = std::atoi(argv[i+1]);
        if (r < 1) badArgument();
        return r;
      }
    return defaultValue;
  }

  // Long option with the same "must be >= 1" rule as getOptionIntValue.
  long getOptionLongValue(std::string option, long defaultValue) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) {
        long r = std::atol(argv[i+1]);
        if (r < 1) badArgument();
        return r;
      }
    return defaultValue;
  }

  // Floating-point option.
  double getOptionDoubleValue(std::string option, double defaultValue) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) {
        double val;
        // sscanf returns the number of conversions performed.  Comparing
        // against EOF only catches empty input; non-numeric text yields 0
        // and would leave `val` uninitialised, so require exactly one.
        if (sscanf(argv[i+1], "%lf", &val) != 1) {
          badArgument();
        }
        return val;
      }
    return defaultValue;
  }

};
 
#endif // _PARSE_COMMAND_LINE


================================================
FILE: algorithms/bench/common/parse_command_line.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <iostream>
#include <fstream>
#include <string>
#include <cstring>

// Lightweight accessor for a program's command-line arguments.
//
// Positional arguments are addressed from the END of argv, and options are
// located by exact string comparison against argv[1..].  Missing or
// malformed arguments print the usage line and terminate via exit(0).
struct commandLine {
  int argc;
  char** argv;
  std::string comLine;  // usage text printed after the program name

  // Constructing with a usage string also handles "-h" / "-help": if either
  // is present the usage line is printed and the process exits.
  commandLine(int _c, char** _v, std::string _cl)
    : argc(_c), argv(_v), comLine(_cl) {
      if (getOption("-h") || getOption("-help"))
        badArgument();
    }

  commandLine(int _c, char** _v)
    : argc(_c), argv(_v), comLine("bad arguments") { }

  // Prints "usage: <argv[0]> <comLine>" on stdout and exits with status 0.
  void badArgument() {
    std::cout << "usage: " << argv[0] << " " << comLine << std::endl;
    exit(0);
  }

  // get an argument
  // i is indexed from the last argument = 0, second to last indexed 1, ..
  char* getArgument(int i) {
    if (argc < 2+i) badArgument();
    return argv[argc-1-i];
  }

  // looks for two filenames: the last two entries of argv
  std::pair<char*,char*> IOFileNames() {
    if (argc < 3) badArgument();
    return std::pair<char*,char*>(argv[argc-2], argv[argc-1]);
  }

  // Second-to-last argument parsed with atoi (then widened to size_t),
  // last argument returned as-is.
  std::pair<size_t,char*> sizeAndFileName() {
    if (argc < 3) badArgument();
    return std::pair<size_t,char*>(std::atoi(argv[argc-2]), argv[argc-1]);
  }

  // True when `option` appears anywhere in argv[1..argc-1].
  bool getOption(std::string option) {
    for (int i = 1; i < argc; i++)
      if (option == argv[i]) return true;
    return false;
  }

  // Returns the argument following `option`, or a null pointer when the
  // option is absent.  The final argument is never treated as an option
  // because nothing follows it.
  char* getOptionValue(std::string option) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) return argv[i+1];
    return nullptr;
  }

  // Same lookup as above, returning `defaultValue` when the option is absent.
  std::string getOptionValue(std::string option, std::string defaultValue) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) return std::string(argv[i+1]);
    return defaultValue;
  }

  // Long option; negative values are rejected.
  long getOptionLongValue(std::string option, long defaultValue) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) {
        long r = atol(argv[i+1]);
        if (r < 0) badArgument();
        return r;
      }
    return defaultValue;
  }

  // Integer option; negative values are rejected.
  int getOptionIntValue(std::string option, int defaultValue) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) {
        int r = atoi(argv[i+1]);
        if (r < 0) badArgument();
        return r;
      }
    return defaultValue;
  }

  // Floating-point option.
  double getOptionDoubleValue(std::string option, double defaultValue) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) {
        double val;
        // sscanf returns the number of conversions performed.  Comparing
        // against EOF only catches empty input; non-numeric text yields 0
        // and would leave `val` uninitialised, so require exactly one.
        if (sscanf(argv[i+1], "%lf", &val) != 1) {
          badArgument();
        }
        return val;
      }
    return defaultValue;
  }

};


================================================
FILE: algorithms/bench/common/runTests.py
================================================
import subprocess
import sys
import random
import os

def onPprocessors(command,p) :
  """Prefix `command` with the environment assignment that limits it to p threads.

  The variable chosen depends on which scheduler the environment selects:
  OMP_NUM_THREADS when OPENMP is set, CILK_NWORKERS when CILK is set, and
  PARLAY_NUM_THREADS otherwise.
  """
  # (An unreachable `return command` after the first return was removed.)
  if "OPENMP" in os.environ:
    return "OMP_NUM_THREADS="+repr(p)+" " + command
  elif "CILK" in os.environ:
    return "CILK_NWORKERS="+repr(p)+" " + command
  else:
    return "PARLAY_NUM_THREADS="+repr(p)+" " + command
  
def shellGetOutput(str) :
  """Run `str` through the shell and return its standard output as text.

  Anything written to standard error is treated as a failure: a NameError
  is raised whose message holds the command followed by the captured
  stdout and stderr.
  """
  process = subprocess.Popen(str,shell=True,stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
  output, err = process.communicate()

  if (len(err) > 0):
      # communicate() yields bytes; decode before concatenating with the
      # command text, otherwise the intended NameError is masked by a
      # TypeError.  "replace" keeps undecodable bytes from hiding the error.
      raise NameError(str+"\n"+output.decode("utf-8", "replace")
                      +err.decode("utf-8", "replace"))
  return output.decode("utf-8")

def stripFloat(val) :
  """Truncate val to three decimals and render it without trailing zeros.

  The decimal point itself is kept, so 2.0 becomes "2.".
  """
  thousandths = int(val*1000)
  truncated = float(thousandths)/1000
  text = str(truncated)
  return text.rstrip('0')

def runSingle(runProgram, options, ifile, procs) :
  """Run ./runProgram once and return the times it reported.

  The command is "./<runProgram> <options> <ifile>", wrapped by
  onPprocessors when procs is positive.  Every output line starting with
  "Parlay time: " contributes one float to the returned list.  A value that
  cannot be parsed raises NameError carrying the command and its output.
  """
  command = " ".join(["./"+runProgram, options, ifile])
  if procs > 0 :
    command = onPprocessors(command, procs)
  out = shellGetOutput(command)
  #print(out)
  try:
    times = []
    for line in out.split('\n'):
      if line.startswith("Parlay time: "):
        # the number starts two characters after the first ':'
        times.append(float(line[line.index(':')+2:]))
    return times
  except (ValueError,IndexError):
    raise NameError(command+"\n"+out)

def geomean(a) :
  """Return the geometric mean of the numbers in a (product ** (1/len))."""
  product = 1.0
  for value in a :
    product *= value
  exponent = 1.0/len(a)
  return product**exponent

def runTest(runProgram, checkProgram, dataDir, test, rounds, procs, noOutput, keepData) :
    """Run one benchmark test case, optionally verify it, and print its times.

    `test` is a four-element list [weight, inputFileNames, runOptions,
    checkOptions]; inputFileNames may be a single string or a list of names.
    Returns [weight, times], where times is what runSingle reported.
    Raises NameError (directly or via the helpers) when the checker prints
    any non-comment output.
    """
    random.seed()
    # Randomly named scratch file that receives the program's output.
    outFile="/tmp/ofile%d_%d" %(random.randint(0, 1000000), random.randint(0, 1000000)) 
    [weight, inputFileNames, runOptions, checkOptions] = test
    if type(inputFileNames) is str :
      inputFileNames = [inputFileNames]
    shortInputNames = " ".join(inputFileNames)
    # With a data directory, the inputs are (re)built there with make.
    if len(dataDir)>0:
      out = shellGetOutput("cd " + dataDir + "; make " + shortInputNames)
    longInputNames = " ".join(dataDir + "/" + name for name in inputFileNames)
    runOptions = runOptions + " -r " + repr(rounds)
    if (noOutput == 0) :
      runOptions = runOptions + " -o " + outFile
    times = runSingle(runProgram, runOptions, longInputNames, procs)
    if (noOutput == 0) :
      checkString = ("./" + checkProgram + " " + checkOptions + " "
                     + longInputNames + " " + outFile)
      checkOut = shellGetOutput(checkString)
      # Allow checker output comments: any line starting with ':' is ignored,
      # as are empty lines.  Anything else counts as a check failure.
      nonCommentLines = [s for s in checkOut.split('\n') if not s.startswith(':') and len(s)>0]
      if (len(nonCommentLines) > 0) :
        print("CheckOut:", checkOut)
        raise NameError(checkString+"\n"+checkOut)
      os.remove(outFile)
    # Generated inputs are deleted again unless the caller asked to keep them.
    if len(dataDir)>0 and not(keepData):
      out = shellGetOutput("rm " + longInputNames)
    # str() of the list minus its surrounding brackets gives "'a', 'b', ...".
    ptimes = str([stripFloat(time)
                  for time in times])[1:-1]
    outputStr = ""
    if (len(runOptions) > 0) :
      outputStr = " : " + runOptions
    print(shortInputNames + outputStr + " : "
          + ptimes + ", geomean = " + stripFloat(geomean(times)))
    return [weight,times]
    
def averageTime(times) :
    """Return the arithmetic mean of times."""
    total = sum(times)
    count = len(times)
    return total/count
    
def timeAll(name, runProgram, checkProgram, dataDir, tests, rounds, procs, noOutput,
            addToDatabase, problem, keepData) :
  """Run every test, print a summary line, and optionally record the run.

  Each entry of `tests` is passed to runTest.  The summary reports the
  geometric mean of the per-test minimum times and of the per-test
  geometric means.  When addToDatabase is true the weighted mean, minimum
  and median times are handed to dbAddResult; a failure there is reported
  but does not fail the run.

  Returns 0 on success and 1 when a test raised NameError or the run was
  interrupted from the keyboard.
  """
  try:
    results = [runTest(runProgram, checkProgram, dataDir, test, rounds, procs,
                       noOutput, keepData)
               for test in tests]
    meanOfMeans = geomean([geomean(times) for (w,times) in results])
    meanOfMins = geomean([sorted(times)[0] for (w,times) in results])
    print(name + " : " + repr(procs) +" : " +
          "geomean of mins = " + stripFloat(meanOfMins) +
          ", geomean of geomeans = " + stripFloat(meanOfMeans))
    if (addToDatabase) :
      try:
        # These weighted totals were previously referenced without ever
        # being computed, so every insertion died with a NameError that the
        # handler below swallowed.
        totalTimeMean = 0
        totalTimeMin = 0
        totalTimeMedian = 0
        totalWeight = 0
        for (weight, times) in results:
          count = len(times)
          if count == 0:
            continue
          ordered = sorted(times)
          totalTimeMean = totalTimeMean + weight*sum(ordered)/count
          totalTimeMin = totalTimeMin + weight*ordered[0]
          # lower median for even-length lists
          totalTimeMedian = totalTimeMedian + weight*ordered[(count-1)//2]
          totalWeight = totalWeight + weight
        dbAddResult(problem=problem, program=runProgram, results=results, numProcs=procs, mean=totalTimeMean/totalWeight,
                    min=totalTimeMin/totalWeight, median=totalTimeMedian/totalWeight, tests=tests)
      except:
        # Database recording is best-effort: report and carry on.
        print("Could not insert result in database. Error:", sys.exc_info()[0])
    return 0
  except NameError as x:
    print("TEST TERMINATED ABNORMALLY:\n["+str(x) + "]")
    return 1
  except KeyboardInterrupt:
    return 1


def getOption(str) :
  """Return True when the flag `str` occurs among the command-line arguments."""
  return any(arg == str for arg in sys.argv[1:])

def getArg(str, default) :
  """Return the argument following the flag `str`, or `default`.

  A flag that is the very last argument has no value and is ignored.
  """
  argv = sys.argv
  # Pair every argument after the program name with its successor.
  for flag, value in zip(argv[1:], argv[2:]) :
    if flag == str :
      return value
  return default

def getArgs() :
  """Read the driver's command-line switches.

  Returns (noOutput, rounds, addToDatabase, processors, keep) taken from
  -x, -r (default 1), -d, -p (default 0) and -k respectively.
  """
  suppressOutput = getOption("-x")
  useDatabase = getOption("-d")
  numProcs = int(getArg("-p", 0))
  numRounds = int(getArg("-r", 1))
  keepInputs = getOption("-k")
  return (suppressOutput, numRounds, useDatabase, numProcs, keepInputs)

def timeAllArgs(runProgram, problem, checkProgram, dataDir, tests, keepInputData=False) :
  """Parse the command line and run timeAll for the current directory.

  Inputs are kept when either keepInputData is true or -k was given.  The
  benchmark name is the basename of the current working directory.  The
  result of timeAll is not returned.
  """
  (noOutput, rounds, addToDatabase, procs, keepFlag) = getArgs()
  keepInputs = keepInputData or keepFlag
  benchName = os.path.basename(os.getcwd())
  timeAll(benchName, runProgram, checkProgram, dataDir, tests, rounds, procs,
          noOutput, addToDatabase, problem, keepInputs)

#
# Database insertions
# - akyrola@cs.cmu.edu

import os

def dbInitConnection():
    """Open the results database and publish its cursor as the global `cursor`.

    MySQLdb is imported here so the module can be used without it as long
    as no database insertion is requested.  Autocommit is switched on.
    """
    import MySQLdb
    global cursor
    # TODO: move to a config file
    # NOTE(review): connection credentials are hard-coded in source.
    connection = MySQLdb.connect(
        host="multi6.aladdin.cs.cmu.edu",
        user="pbbs",
        passwd="pbbspasshuuhaa",
        db="pbbsweb")

    cursor = connection.cursor()
    connection.autocommit(1)


def dbAddResult(problem, program, results, numProcs, mean, min, median, tests):
    """Record one benchmark run and its individual timings in the database.

    Inserts a row into pbbs_runs for the run as a whole, then one row into
    pbbs_subruns for every time of every test.  `results` and `tests` are
    indexed in parallel: results[i] is (weight, times) and tests[i] is
    [weight, inputFileNames, runOptions, checkOptions].
    """
    dbInitConnection()
    contentHash = computeContentHash(tests)
    # Program name is stored as "<last component of pwd>/<program>".
    program = shellGetOutput("pwd").split('/')[-1].replace('\r','').replace('\n', '') + '/' + program
    problemId = dbGetProblemId(problem, contentHash)
    programId = dbGetProgramId(program, problemId)
    hostId = getHostId()

       
    #username = os.getlogin()
    # getlogin does not work with some terminals (see various posts on web)
    # guyb replaced with the following
    username = os.getenv('USER')
    # A processor count of 0 means "not specified"; record the detected count.
    if (numProcs == 0): numProcs = detectCPUs()
    # Insert run into db
    cursor.execute(""" insert into pbbs_runs (problem_id,program_id,numprocs,mean_time,min_time,median_time,username,host_id) values(
                                                %s,      %s,          %s,      %s,       %s,       %s,       %s,      %s)
                       """, (problemId, programId, numProcs, mean, min, median, username, hostId))
    # The id of the row just inserted links the sub-run rows to this run.
    cursor.execute(" select last_insert_id()")
    runId = cursor.fetchone()[0]
    
    for i in range(0, len(results)):
        (weight, times) = results[i]
        test = tests[i]
        # `weight` is re-read from the test description here, replacing the
        # value unpacked from results[i] just above.
        [weight,inputFileNames,runOptions,checkOptions] = test
        if type(inputFileNames) is list :
          inputFileNames = "+".join(inputFileNames)
        for time in times:
            cursor.execute(""" insert into pbbs_subruns(run_id, inputfile, time, weight, params, check_params) values(
                                                       %s,          %s      , %s ,   %s,       %s,     %s) """,
                                                        (runId, inputFileNames, time, weight, runOptions, checkOptions))
        
    
def computeContentHash(tests):
    """Build a textual fingerprint of a list of test descriptions.

    Each test contributes ";<weight as %f><inputs><runOptions><checkOptions>"
    with the string fields stripped and list-valued inputs joined by '+'.
    Spaces in the final result are replaced by underscores.
    """
    pieces = []
    for weight, inputFileNames, runOptions, checkOptions in tests:
        if type(inputFileNames) is list :
            inputFileNames = "+".join(inputFileNames)
        fields = (weight, inputFileNames.strip(), runOptions.strip(), checkOptions.strip())
        pieces.append(";%f%s%s%s" % fields)
    return "".join(pieces).replace(' ', '_')
    
def dbGetProblemId(probname, contentHash):
    """Return the id of the pbbs_problems row for (probname, contentHash).

    The row is inserted first when no matching row exists.
    """
    key = (probname, contentHash)
    cursor.execute("select id from pbbs_problems where name=%s and content_hash=%s", key)
    found = cursor.fetchone()
    if found is not None:
        return found[0]
    # Insert into db
    cursor.execute( "insert into pbbs_problems (name,content_hash) values(%s,%s) ", key)
    cursor.execute(" select last_insert_id()")
    return cursor.fetchone()[0]
    
def dbGetProgramId(progname, problemId): 
    """Return the id of the pbbs_programs row for (progname, problemId).

    The row is inserted first when no matching row exists.
    """
    cursor.execute("select id from pbbs_programs where name=%s and problem_id=%s", (progname, problemId))
    found = cursor.fetchone()
    if found is not None:
        return found[0]
    # Insert into db
    cursor.execute( "insert into pbbs_programs (problem_id, name) values(%s, %s) ", (problemId, progname))
    cursor.execute(" select last_insert_id()")
    return cursor.fetchone()[0]
    
import platform
def getHostId():
    """Return the id of the pbbs_hosts row describing this machine.

    A host is identified by hostname, processor model, OS version and
    processor count; a new row is inserted when none matches.
    """
    (procmodel, mhz) = detectCPUModel() 
    numprocs = detectCPUs()
    
    (sysname, nodename, release, version, machine) = os.uname()
    
    # Runs made with OPENMP set are recorded under a distinct host name.
    if ("OPENMP" in os.environ):
       nodename = nodename + "[OPENMP]"
    
    cursor.execute("select id from pbbs_hosts where hostname=%s and procmodel=%s and version=%s and numprocs=%s", (nodename, procmodel, version, numprocs))
    row = cursor.fetchone()
    if row == None:
        cursor.execute(""" insert into pbbs_hosts(hostname,sysname,releasen,version,machine,numprocs,procmodel,mhz) values
                                                  (%s,      %s,        %s,   %s,    %s,    %s,           %s,  %s) """,
                                                    (nodename, sysname, release, version, machine, numprocs, procmodel, mhz))
        # Fetch the id of the row that was just inserted.
        cursor.execute(" select last_insert_id()")
        row = cursor.fetchone()
    return row[0]

def detectCPUModel():  
    """Return (model, mhz) describing the processor.

    On Darwin the values come from system_profiler, elsewhere from
    /proc/cpuinfo; in both cases the text after the last ':' is kept.  If
    anything fails, a message is printed and whatever was gathered so far
    is returned (initially platform.processor() and 0).
    """
    mhz = 0
    model = platform.processor()
    afterColon = lambda text: text.split(':')[-1].strip()
    try:
        if platform.system() != "Darwin":
            # only the first matching line of /proc/cpuinfo is used
            model = shellGetOutput('grep "model name" /proc/cpuinfo').split('\n')[0]
            mhz = shellGetOutput('grep "cpu MHz" /proc/cpuinfo').split('\n')[0]
        else:
            model = shellGetOutput("system_profiler SPHardwareDataType |grep 'Processor Name'")
            mhz = shellGetOutput("system_profiler SPHardwareDataType |grep 'Processor Speed'")
        model = afterColon(model)
        mhz = afterColon(mhz)
    except:
        # Could not get processor model 
        print("Could not determine CPU model", sys.exc_info()[0])
    return (model, mhz)

def detectCPUs():
    """
     Detects the number of CPUs on a system. Cribbed from pp.

     Tries os.sysconf first, then `sysctl -n hw.ncpu` when sysconf exists
     but does not know SC_NPROCESSORS_ONLN, then the NUMBER_OF_PROCESSORS
     environment variable.  Returns 1 when none of these gives an answer.
     """
    # Linux, Unix and MacOS:
    if hasattr(os, "sysconf"):
       if "SC_NPROCESSORS_ONLN" in os.sysconf_names:
           # Linux & Unix:
           ncpus = os.sysconf("SC_NPROCESSORS_ONLN")
           if isinstance(ncpus, int) and ncpus > 0:
               return ncpus
       else: # OSX:
           # os.popen2 no longer exists in Python 3; os.popen yields the
           # command's stdout, which is all that was read here.
           with os.popen("sysctl -n hw.ncpu") as pipe:
               return int(pipe.read())
    # Windows:
    if "NUMBER_OF_PROCESSORS" in os.environ:
           ncpus = int(os.environ["NUMBER_OF_PROCESSORS"])
           if ncpus > 0:
               return ncpus
    return 1 # Default    


================================================
FILE: algorithms/bench/common/runTestsANN.py
================================================
import subprocess
import sys
import random
import os

def addLineToFile(oFile, line):
    """Append `line` to the file named by oFile, one entry per line.

    A newline is written before the text when the file already has
    content, so the file never ends with a trailing newline.
    """
    # The path used to be the literal string "oFile", which ignored the
    # parameter and always wrote to a file of that name in the current
    # directory.
    with open(oFile, "a+") as file_object:
        # Move read cursor to the start of file.
        file_object.seek(0)
        # If file is not empty then append '\n'
        data = file_object.read(100)
        if len(data) > 0 :
            file_object.write("\n")
        # Append text at the end of file
        file_object.write(line)

def onPprocessors(command,p) :
  """Prefix `command` with the environment assignment that limits it to p threads.

  The variable chosen depends on which scheduler the environment selects:
  OMP_NUM_THREADS when OPENMP is set, CILK_NWORKERS when CILK is set, and
  PARLAY_NUM_THREADS otherwise.
  """
  # `in` replaces os.environ.has_key(), which Python 3 does not provide.
  # (An unreachable `return command` after the first return was removed.)
  if "OPENMP" in os.environ:
    return "OMP_NUM_THREADS="+repr(p)+" " + command
  elif "CILK" in os.environ:
    return "CILK_NWORKERS="+repr(p)+" " + command
  else:
    return "PARLAY_NUM_THREADS="+repr(p)+" " + command
  
def shellGetOutput(str) :
  """Run `str` through the shell and return its standard output as text.

  Anything written to standard error is treated as a failure: a NameError
  is raised whose message holds the command followed by the captured
  stdout and stderr.
  """
  process = subprocess.Popen(str,shell=True,stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
  output, err = process.communicate()

  if (len(err) > 0):
      # communicate() yields bytes; decode before concatenating with the
      # command text, otherwise the intended NameError is masked by a
      # TypeError.
      raise NameError(str+"\n"+output.decode("utf-8", "replace")
                      +err.decode("utf-8", "replace"))
  # Callers split the result on '\n', which requires text rather than bytes.
  return output.decode("utf-8")

def stripFloat(val) :
  """Truncate val to three decimals and render it without trailing zeros.

  The decimal point itself is kept, so 2.0 becomes "2.".
  """
  scaled = int(val*1000)
  shortened = float(scaled)/1000
  return str(shortened).rstrip('0')

def runSingle(runProgram, options, ifile, procs, oFile) :
  """Run ./runProgram once, log its output, and return the reported times.

  The command is "./<runProgram> <options> <ifile>", wrapped by
  onPprocessors when procs is positive.  Every non-empty output line is
  passed to addLineToFile together with oFile.  Lines starting with
  "Parlay time: " each contribute one float to the returned list; a value
  that cannot be parsed raises NameError with the command and its output.
  """
  command = " ".join(["./"+runProgram, options, ifile])
  if procs > 0 :
    command = onPprocessors(command, procs)
  out = shellGetOutput(command)
  for outputLine in out.split('\n'):
    if len(outputLine) > 0:
      addLineToFile(oFile, outputLine)
  try:
    times = []
    for line in out.split('\n'):
      if line.startswith("Parlay time: "):
        # the number starts two characters after the first ':'
        times.append(float(line[line.index(':')+2:]))
    return times
  except (ValueError,IndexError):
    raise NameError(command+"\n"+out)

def runTest(runProgram, checkProgram, dataDir, test, rounds, procs, noOutput, oFile) :
    """Run one ANN test case, optionally run the checker, and log the times.

    `test` is a six-element list [weight, gFileName, qFileName, iFileName,
    runOptions, checkOptions]; each file entry may be a single string or a
    list of names.  The g-file(s) are passed to the program as its input,
    the q-file(s) via "-q", and the i-file(s) to the checker.  Output lines
    are printed and appended to oFile.  Returns [weight, times].
    """
    random.seed()
    # Randomly named scratch file that receives the program's output.
    outFile="/tmp/ofile%d_%d" %(random.randint(0, 1000000), random.randint(0, 1000000)) 
    [weight, gFileName, qFileName, iFileName, runOptions, checkOptions] = test
    # The same three steps are applied to each file group: normalise to a
    # list, build the files with make when a data directory is given, and
    # form the space-separated list of full paths.
    if type(gFileName) is str :
      gFileName = [gFileName]
    shortgFileName = " ".join(gFileName)
    if len(dataDir)>0:
      out = shellGetOutput("cd " + dataDir + "; make " + shortgFileName)
    longgFileName = " ".join(dataDir + "/" + name for name in gFileName)
    if type(qFileName) is str :
      qFileName = [qFileName]
    shortqFileName = " ".join(qFileName)
    if len(dataDir)>0:
      out = shellGetOutput("cd " + dataDir + "; make " + shortqFileName)
    longqFileName = " ".join(dataDir + "/" + name for name in qFileName)
    if type(iFileName) is str :
      iFileName = [iFileName]
    shortiFileName = " ".join(iFileName)
    if len(dataDir)>0:
      out = shellGetOutput("cd " + dataDir + "; make " + shortiFileName)
    longiFileName = " ".join(dataDir + "/" + name for name in iFileName)
    runOptions = runOptions + " -q " + longqFileName
    runOptions = runOptions + " -r " + repr(rounds)
    if (noOutput == 0) :
      runOptions = runOptions + " -o " + outFile
    times = runSingle(runProgram, runOptions, longgFileName, procs, oFile)
    if (noOutput == 0) :
      checkString = ("./" + checkProgram + " " + checkOptions + " "
                     + longiFileName + " " + outFile)
      checkOut = shellGetOutput(checkString)
      # Checker output is reported, not treated as a failure: every
      # non-empty line is printed and logged.
      nonCommentLines = [s for s in checkOut.split('\n') if len(s)>0]
      for line in nonCommentLines:
        print(line)
        addLineToFile(oFile, line)
      os.remove(outFile)
    # str() of the list minus its surrounding brackets gives "'a', 'b', ...".
    ptimes = str([stripFloat(time)
                  for time in times])[1:-1]
    outputStr = ""
    if (len(runOptions) > 0) :
      outputStr = " : " + runOptions
    outStr = repr(weight) + outputStr + " : " + ptimes
    print(outStr)
    addLineToFile(oFile, outStr)
    return [weight,times]
    
def averageTime(times) :
    """Return the arithmetic mean of times."""
    count = len(times)
    return sum(times)/count
    

def timeAll(name, runProgram, checkProgram, dataDir, tests, rounds, procs, noOutput,
            problem, oFile) :
  """Run every test and print the weighted min / median / mean time.

  Each entry of `tests` is passed to runTest.  Tests that produced no
  timings are reported and left out of the summary.  Returns 1 when a test
  raised NameError or the run was interrupted from the keyboard, and None
  otherwise.
  """
  try:
    results = [runTest(runProgram, checkProgram, dataDir, test, rounds, procs,
                       noOutput, oFile)
               for test in tests]
    totalTimeMean = 0
    totalTimeMin = 0
    totalTimeMedian = 0
    totalWeight = 0
    # results[i] belongs to tests[i]; pairing them directly keeps the
    # warning below pointing at the right test even after earlier skips.
    for test, (weight,times) in zip(tests, results):
      l = len(times)
      if (l == 0):
        print("Warning, no timed results for", test)
        continue
      times = sorted(times)
      totalTimeMean = totalTimeMean + weight*sum(times)/l
      totalTimeMin = totalTimeMin + weight*times[0]
      # Integer division: a float index raises TypeError on Python 3.
      totalTimeMedian = totalTimeMedian + weight*times[(l-1)//2]
      totalWeight = totalWeight + weight
    if totalWeight == 0:
      # Nothing was timed (or all weights are zero): no averages to report.
      print("Warning, no weighted timings to summarize for " + name)
    else:
      print(name + " : " + repr(procs) +" : " +
            "weighted time, min=" + stripFloat(totalTimeMin/totalWeight) +
            " median=" + stripFloat(totalTimeMedian/totalWeight) +
            " mean=" + stripFloat(totalTimeMean/totalWeight))
    # return 0
  except NameError as x:
    print("TEST TERMINATED ABNORMALLY:\n["+str(x) + "]")
    return 1
  except KeyboardInterrupt:
    return 1


def getOption(str) :
  """Return True when the flag `str` occurs among the command-line arguments."""
  for argument in sys.argv[1:] :
    if argument == str :
      return True
  return False

def getArg(str, default) :
  """Return the argument following the flag `str`, or `default`.

  A flag that is the very last argument has no value and is ignored.
  """
  args = sys.argv
  position = 1
  lastWithValue = len(args) - 2
  while position <= lastWithValue :
    if args[position] == str :
      return args[position+1]
    position += 1
  return default

def getArgs() :
  """Read the driver's command-line switches.

  Returns (noOutput, rounds, processors) taken from -x, -r (default 1)
  and -p (default 0) respectively.
  """
  suppressOutput = getOption("-x")
  numProcs = int(getArg("-p", 0))
  numRounds = int(getArg("-r", 1))
  return (suppressOutput, numRounds, numProcs)

def timeAllArgs(runProgram, problem, checkProgram, dataDir, tests, oFile) :
    """Parse the command line and run timeAll for the current directory.

    The benchmark name is the basename of the current working directory.
    The result of timeAll is not returned.
    """
    parsed = getArgs()
    noOutput, rounds, procs = parsed
    benchName = os.path.basename(os.getcwd())
    timeAll(benchName, runProgram, checkProgram, dataDir, tests,
            rounds, procs, noOutput, problem, oFile)


================================================
FILE: algorithms/bench/common/seqDefs
================================================
# Compiler and linker settings for sequential (single-threaded) builds.
#
# Link against jemalloc only when `jemalloc-config` is found on PATH;
# otherwise JEMALLOC stays empty and no extra library is linked.
ifeq (, $(shell which jemalloc-config))
JEMALLOC =
else
JEMALLOCLD = $(shell jemalloc-config --libdir)
JEMALLOC = -L$(JEMALLOCLD) -ljemalloc 
endif

# -DPARLAY_SEQUENTIAL is the only scheduler-related define in this file.
CCFLAGS = -mcx16 -DPARLAY_SEQUENTIAL -O3 -std=c++17 -DNDEBUG
CLFLAGS = $(JEMALLOC)

# Names consumed by the including makefile.
CC = g++
CFLAGS = $(CCFLAGS)
LFLAGS = $(CLFLAGS)


================================================
FILE: algorithms/bench/common/sequenceIO.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include "IO.h"
#include "../parlay/primitives.h"
#include "../parlay/io.h"

// Adds a short alias to the parlay namespace: parlay::chars is a
// sequence of char.
namespace parlay {
  using chars = sequence<char>;
};

// Reading and writing of typed sequences as text files.
//
// A sequence file starts with a header token naming the element type
// (see seqHeader) followed by the elements as whitespace-separated tokens.
namespace benchIO {
  using namespace std;
  using parlay::sequence;
  using parlay::tabulate;
  using parlay::make_slice;

  typedef unsigned int uint;
  typedef parlay::sequence<char> charSeq;
  typedef pair<int,int> intPair;
  typedef pair<unsigned int, unsigned int> uintPair;
  typedef pair<unsigned int, int> uintIntPair;
  typedef pair<long,long> longPair;
  typedef pair<charSeq,long> stringIntPair;
  typedef pair<double,double> doublePair;


  // Tags for the element types a sequence file can hold.
  enum elementType { none, intType, intPairT, doublePairT,
		     stringIntPairT, doubleT, stringT};

  // dataType(x) maps the static type of x to its elementType tag; the
  // argument value is never inspected.  All integer widths share intType
  // and all integer pairs share intPairT.
  //
  // These (and the other non-template functions below) are `inline`
  // because this header may be included from several translation units;
  // without it each one would emit its own definition and linking fails.
  inline elementType dataType(long) { return intType;}
  inline elementType dataType(int) { return intType;}
  inline elementType dataType(uint) { return intType;}
  inline elementType dataType(double) { return doubleT;}
  inline elementType dataType(charSeq) { return stringT;}
  inline elementType dataType(char*) { return stringT;}
  inline elementType dataType(intPair) { return intPairT;}
  inline elementType dataType(uintPair) { return intPairT;}
  inline elementType dataType(uintIntPair) { return intPairT;}
  inline elementType dataType(longPair) { return intPairT;}
  inline elementType dataType(stringIntPair) { return stringIntPairT;}
  inline elementType dataType(doublePair) { return doublePairT;}

  // Header token written at the top of a file holding elements of type dt.
  // Aborts for tags that have no file representation (e.g. `none`).
  inline string seqHeader(elementType dt) {
    switch (dt) {
    case intType: return "sequenceInt";
    case doubleT: return "sequenceDouble";
    case stringT: return "sequenceChar";
    case intPairT: return "sequenceIntPair";
    case stringIntPairT: return "sequenceStringIntPair";
    case doublePairT: return "sequenceDoublePair";
    default: 
      cout << "writeSeqToFile: type not supported" << endl; 
      abort();
    }
  }

  // Inverse of seqHeader: maps a header token (any range of chars) to its
  // tag, or `none` when the token is not a known header.
  template <typename Range>
  elementType elementTypeFromHeader(Range R) {
    string s(R.begin(), R.end());
    if (s == "sequenceInt") return intType;
    else if (s == "sequenceDouble") return doubleT;
    else if (s == "sequenceChar") return stringT;
    else if (s == "sequenceIntPair") return intPairT;
    else if (s == "sequenceStringIntPair") return stringIntPairT;
    else if (s == "sequenceDoublePair") return doublePairT;
    else return none;
  }

  // Maps the short type names "double", "string" and "int" to their tags,
  // or `none` for anything else.
  template <typename Range>
  elementType elementTypeFromString(Range R) {
    string s(R.begin(), R.end());
    if (s == "double") return doubleT;
    else if (s == "string") return stringT;
    else if (s == "int") return intType;
    else return none;
  }

  // Token-to-number conversions; chars_to_long / chars_to_double are found
  // through argument-dependent lookup on charSeq.
  inline long read_long(charSeq const &S) {
    return chars_to_long(S);}

  inline double read_double(charSeq const &S) {
    return chars_to_double(S);}

  using charseq_slice = parlay::slice<const charSeq*, const charSeq*>;
  

  // specialized parsing functions
  //
  // parseElements<T>(S) converts a range of tokens into a sequence<T>.
  // Exactly one overload is enabled for each supported T.  Scalar types
  // consume one token per element; pair types consume two consecutive
  // tokens per element (a trailing unpaired token is ignored).
  template<typename T, typename Range>
  inline typename std::enable_if<std::is_same<T, double>::value, sequence<double>>::type
  parseElements(Range const &S) {
    return tabulate(S.size(), [&] (long i) -> double {return read_double(S[i]);});
  }

  template<typename T, typename Range>
  inline typename std::enable_if<std::is_same<T, int>::value, sequence<int>>::type
  parseElements(Range const &S) {
    return tabulate(S.size(), [&] (long i) -> int {return (int) read_long(S[i]);});
  }

  template<typename T, typename Range>
  inline typename std::enable_if<std::is_same<T, long>::value, sequence<long>>::type
  parseElements(Range const &S) {
    return tabulate(S.size(), [&] (long i) -> long {return (long) read_long(S[i]);});
  }

  template<typename T, typename Range>
  inline typename std::enable_if<std::is_same<T, uint>::value, sequence<uint>>::type
  parseElements(Range const &S) {
    return tabulate(S.size(), [&] (long i) -> uint {return (uint) read_long(S[i]);});
  }

  template<typename T, typename Range>
  inline typename std::enable_if<std::is_same<T, intPair>::value, sequence<intPair>>::type
  parseElements(Range const &S) {
    return tabulate((S.size())/2, [&] (long i) -> intPair {
      return std::make_pair((int) read_long(S[2*i]), (int) read_long(S[2*i+1]));});
  }

  template<typename T, typename Range>
  inline typename std::enable_if<std::is_same<T, uintPair>::value, sequence<uintPair>>::type
  parseElements(Range const &S) {
    return tabulate((S.size())/2, [&] (long i) -> uintPair {
      return std::make_pair((uint) read_long(S[2*i]), (uint) read_long(S[2*i+1]));});
  }

  template<typename T, typename Range>
  inline typename std::enable_if<std::is_same<T, doublePair>::value, sequence<doublePair>>::type
  parseElements(Range const &S) {
    return tabulate((S.size())/2, [&] (long i) -> doublePair {
      return std::make_pair(read_double(S[2*i]), read_double(S[2*i+1]));});
  }

  // Strings need no conversion: the tokens themselves are the elements.
  template<typename T, typename Range>
  inline typename std::enable_if<std::is_same<T, charSeq>::value, sequence<charSeq>>::type
  parseElements(Range const &S) {
    return parlay::to_sequence(S);
  }

  // sequence<stringIntPair> parseElements<stringIntPair>(Range S) {
  //   return sequence<stringIntPair>(0);
  // }  

  // Verifies that the first token of S is the header expected for element
  // type T, aborting with a message otherwise.  S is the whole token
  // range; S[0] is the header token.
  template <typename T, typename CharRange>
  void check_header(CharRange& S) {
    // An input without any token has no header to compare against.
    if (S.size() == 0) {
      cout << "bad header: expected " << seqHeader(dataType(T()))
	   << " got an empty input" << endl;
      abort();
    }
    string header(S[0].begin(), S[0].end());
    // T() is value-initialised, so no indeterminate value is copied into
    // the by-value dataType parameter.
    string type_str = seqHeader(dataType(T()));
    if (header != type_str) {
      cout << "bad header: expected " << type_str << " got " << header << endl;
      abort();
    }
  }

  // reads file, tokenizes and then dispatches to specialized parsing function
  template <typename T>
  sequence<T> readSequenceFromFile(char const *fileName) {
    auto S = get_tokens(fileName);
    // check_header indexes [0] itself, so it must receive the token range.
    // Passing S[0] (a single token) made it index into one character.
    check_header<T>(S);
    return parseElements<T>(S.cut(1,S.size()));
  }
  
  // Writes A to fileName preceded by the header for its element type and
  // returns whatever writeSeqToFile returns.
  template <class T>
  int writeSequenceToFile(sequence<T> const &A, char const *fileName) {
    // The tag depends only on T, so derive it from a value-initialised T
    // instead of reading A[0], which does not exist for an empty sequence.
    elementType tp = dataType(T());
    return writeSeqToFile(seqHeader(tp), A, fileName);
  }

}


================================================
FILE: algorithms/bench/common/speculative_for.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include "../parlay/parallel.h"
#include "../parlay/primitives.h"
//#include "atomics.h"
#include <limits>

namespace pbbs {

  // A reservation cell holding the smallest iteration index that has
  // claimed it; max_idx means "not reserved".
  //
  // idxT should be able to represent the range of iterations:
  //   int is OK for up to 2^31 iterations,
  //   unsigned is OK if freeze() is not used (freeze stores -1).
  template <class idxT>
  struct reservation {
    std::atomic<idxT> r;
    static constexpr idxT max_idx = std::numeric_limits<idxT>::max();

    reservation() : r(max_idx) {}

    // current holder of the reservation (max_idx if none)
    idxT get() const { return r.load(); }

    // tries to lower the stored index to i via parlay::write_min and
    // returns that call's result
    bool reserve(idxT i) { return parlay::write_min(&r, i, std::less<idxT>()); }

    bool reserved() const { return get() < max_idx; }

    void reset() { r.store(max_idx); }

    // stores -1, which is below every non-negative index
    void freeze() { r.store(static_cast<idxT>(-1)); }

    bool check(idxT i) const { return get() == i; }

    // if i holds the reservation, releases it and returns true
    bool checkReset(idxT i) {
      if (r.load() != i) return false;
      r.store(max_idx);
      return true;
    }
  };

  // Runs iterations s, s+1, ..., e-1 speculatively in rounds.
  //
  // In each round a prefix of the remaining iterations is attempted: first
  // reserve(i) is called for every iteration i of the round, then commit(i)
  // is called for those whose reserve returned true.  An iteration whose
  // reserve returned true but whose commit returned false is carried over
  // and retried in the next round.  The round size shrinks when more than
  // 20% of a round has to be retried and grows when fewer than 10% does.
  //
  //   step        : object providing reserve(idxT) and commit(idxT)
  //   s, e        : half-open iteration range [s, e)
  //   granularity : the largest round holds (e-s)/granularity + 1 iterations
  //   hasState    : if true every slot of a round works on its own copy of
  //                 step, otherwise all slots share step
  //   maxTries    : maximum number of rounds; negative selects
  //                 100 + 200*granularity
  //
  // Returns the number of iteration attempts made, retries included.
  // Throws std::runtime_error when more than maxTries rounds are needed.
  template <class idxT, class S>
  long speculative_for(S step, idxT s, idxT e, long granularity,
                       bool hasState=1, long maxTries=-1) {
    // Nothing to do for an empty (or inverted) range; returning here also
    // keeps the buffer sizes below strictly positive.
    if (!(s < e)) return 0;

    if (maxTries < 0) maxTries = 100 + 200*granularity;
    long maxRoundSize = (e-s)/granularity+1;
    // A round has to contain at least one iteration.  Without the clamp a
    // small range (maxRoundSize < 4) starts with round size 0, never makes
    // progress, and ends in the "too many iterations" error below.
    long currentRoundSize = std::max<long>(maxRoundSize/4, 1);
    // integer types, do not need to be initialized
    auto I = parlay::sequence<idxT>::uninitialized(maxRoundSize);
    auto keep = parlay::sequence<bool>::uninitialized(maxRoundSize);
    parlay::sequence<idxT> Ihold;  // initially empty
    parlay::sequence<S> state;
    if (hasState)
      state = parlay::tabulate(maxRoundSize, [&] (size_t i) -> S {return step;});

    long round = 0;
    long numberDone = s; // number of iterations done
    long numberKeep = 0; // number of iterations to carry to next round
    long totalProcessed = 0; // number done including wasted tries

    while (numberDone < e) {
      if (round++ > maxTries)
        throw std::runtime_error("speculative_for: too many iterations, increase maxTries");
      long size = std::min(currentRoundSize, e - numberDone);

      totalProcessed += size;
      size_t loop_granularity = 0;

      // Reserve phase: the first numberKeep slots retry the iterations held
      // over from the previous round, the remaining slots take new ones.
      if (hasState) {
        parlay::parallel_for (0, size, [&] (size_t i) {
          I[i] = (i < numberKeep) ? Ihold[i] : numberDone + i;
          keep[i] = state[i].reserve(I[i]);
        }, loop_granularity);
      } else {
        parlay::parallel_for (0, size, [&] (size_t i) {
          I[i] = (i < numberKeep) ? Ihold[i] : numberDone + i;
          keep[i] = step.reserve(I[i]);
        }, loop_granularity);
      }

      // Commit phase: keep[i] ends up true only for iterations that were
      // reserved but failed to commit.
      if (hasState) {
        parlay::parallel_for (0, size, [&] (size_t i) {
          if (keep[i]) keep[i] = !state[i].commit(I[i]);}, loop_granularity);
      } else {
        parlay::parallel_for (0, size, [&] (size_t i) {
          if (keep[i]) keep[i] = !step.commit(I[i]);}, loop_granularity);
      }

      // keep iterations that failed for next round
      Ihold = parlay::pack(I.head(size), keep.head(size));
      numberKeep = Ihold.size();
      numberDone += size - numberKeep;

      //std::cout << size << " : " << numberKeep << " : "
      //  << numberDone << " : " << currentRoundSize << std::endl;

      // adjust round size based on number of failed attempts
      if (float(numberKeep)/float(size) > .2)
        currentRoundSize = std::max(currentRoundSize/2,
                                    std::max(maxRoundSize/64 + 1, numberKeep));
      else if (float(numberKeep)/float(size) < .1)
        currentRoundSize = std::min(currentRoundSize * 2, maxRoundSize);
    }
    return totalProcessed;
  }
} // namespace pbbs


================================================
FILE: algorithms/bench/common/time_loop.h
================================================
#include "../parlay/internal/get_time.h"

// Calls initf(), runf(), endf() in that order `rounds` times.  Only runf()
// lies between the timer's start() and next("") calls, so initf and endf
// are excluded from the measured interval.
template<class F, class G, class H>
void time_loop(int rounds, double delay, F initf, G runf, H endf) {
  parlay::internal::timer t;
  // warm-up: run complete cycles for `delay` seconds before measuring;
  // this is skipped entirely when delay is zero
  while (t.total_time() < delay) {
    initf();
    runf();
    endf();
  }
  for (int left = rounds; left > 0; --left) {
    initf();
    t.start();
    runf();
    t.next("");
    endf();
  }
}


================================================
FILE: algorithms/bench/common/topology.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#ifndef _TOPOLOGY_INCLUDED
#define _TOPOLOGY_INCLUDED

#include <iostream>
#include "geometry.h"

using namespace std;

// *************************************************************
//    TOPOLOGY
// *************************************************************

template <typename point>
struct vertex;

// an unoriented triangle with its three neighbors and 3 vertices
//          vtx[1]
//           o 
//           | \ -> ngh[1]
// ngh[2] <- |   o vtx[0]
//           | / -> ngh[0]
//           o
//         vtx[2]
template <typename point>
struct triangle {
  using tri_t = triangle<point>;
  using vtx_t = vertex<point>;
  tri_t *ngh [3];    // neighbouring triangles, NULL where there is none
  vtx_t *vtx [3];    // corner vertices
  size_t id;
  bool initialized;
  char bad;  // used to mark badly shaped triangles

  // sets all three neighbour pointers
  void setT(tri_t *t1, tri_t *t2, tri_t* t3) {
    ngh[0] = t1; ngh[1] = t2; ngh[2] = t3; }

  // sets all three vertex pointers
  void setV(vtx_t *v1, vtx_t *v2, vtx_t *v3) {
    vtx[0] = v1; vtx[1] = v2; vtx[2] = v3; }

  // Returns the index i with ngh[i] == t.  Prints a message and aborts
  // when t is not a neighbour of this triangle.
  int locate(tri_t *t) {
    for (int i=0; i < 3; i++) {
      //cout << t << ", " << ngh[i] << endl;
      if (ngh[i] == t) return i;
    }
    std::cout<<"did not locate back pointer in triangulation\n";
    abort(); // did not find
  }

  // Replaces the first neighbour pointer equal to t by tn.  Prints a
  // message and aborts when t is not a neighbour of this triangle.
  void update(tri_t *t, tri_t *tn) {
    for (int i=0; i < 3; i++)
      if (ngh[i] == t) {ngh[i] = tn; return;}
    std::cout<<"did not update\n";
    abort(); // did not find
  }
};

// a vertex pointing to an arbitrary triangle to which it belongs (if any)
template <typename point>
struct vertex {
  using point_t = point;
  using tri = triangle<point>;
  point pt;
  // Every member below has a default so that neither constructor leaves an
  // indeterminate pointer or counter behind.
  tri *t = nullptr;
  tri *badT = nullptr;
  int id = 0;
  int reserve = -1;
  size_t counter = 0;

  // prints the id and the x/y coordinates of the point
  void print() {
    std::cout << id << " (" << pt.x << "," << pt.y << ") " << std::endl;
  }

  // vertex at position p with identifier i; the remaining members keep
  // their defaults (no triangle, reserve == -1, counter == 0)
  vertex(point p, size_t i) : pt(p), id(static_cast<int>(i)) {}

  vertex() {}
};

// Wraps an index from the range 0..5 back into 0..2 by subtracting 3 from
// anything larger than 2; values of 2 or less are returned unchanged.
inline int mod3(int i) {
  if (i > 2) return i - 3;
  return i;
}

// a simplex is just an oriented triangle.  An integer (o)
// is used to indicate which of 3 orientations it is in (0,1,2)
// If boundary is set then it represents the edge through t.ngh[o],
// which is a NULL pointer.
template <typename point>
struct simplex {
  using vtx_t = vertex<point>;
  using tri_t = triangle<point>;
  tri_t *t;
  int o;
  bool boundary;
  simplex(tri_t *tt, int oo) : t(tt), o(oo), boundary(0) {}
  simplex(tri_t *tt, int oo, bool _b) : t(tt), o(oo), boundary(_b) {}
  simplex(vtx_t *v1, vtx_t *v2, vtx_t *v3, tri_t *tt) {
    t = tt;
    t->ngh[0] = t->ngh[1] = t->ngh[2] = NULL;
    t->vtx[0] = v1; v1->t = t;
    t->vtx[1] = v2; v2->t = t;
    t->vtx[2] = v3; v3->t = t;
    o = 0;
    boundary = 0;
  }
  simplex() : t(nullptr), o(0), boundary(false) {}

  void print() {
    if (t == NULL) cout << "NULL simp" << endl;
    else {
      cout << "vtxs=";
      for (int i=0; i < 3; i++) 
	if (t->vtx[mod3(i+o)] != NULL)
	  cout << t->vtx[mod3(i+o)]->id << " (" <<
	    t->vtx[mod3(i+o)]->pt.x << "," <<
	    t->vtx[mod3(i+o)]->pt.y << ") ";
	else cout << "NULL ";
      cout << endl;
    }
  }

  simplex across() {
    tri_t *to = t->ngh[o];
    if (to != NULL) return simplex(to,to->locate(t));
    else return simplex(t,o,1);
  }

  // depending on initial triangle this could be counterclockwise
  simplex rotClockwise() { return simplex(t,mod3(o+1));}

  bool valid() {return (!boundary);}
  bool isTriangle() {return (!boundary);}
  bool isBoundary() {return boundary;}
  
  vtx_t *firstVertex() {return t->vtx[o];}

  bool inCirc(vtx_t *v) {
    if (boundary || t == NULL) return 0;
    return inCircle(t->vtx[0]->pt, t->vtx[1]->pt, 
		    t->vtx[2]->pt, v->pt);
  }

  // the angle facing the across edge
  double farAngle() {
    return angle(t->vtx[mod3(o+1)]->pt,
		 t->vtx[o]->pt,
		 t->vtx[mod3(o+2)]->pt);
  }

  bool outside(vtx_t *v) {
    if (boundary || t == NULL) return 0;
    return counterClockwise(t->vtx[mod3(o+2)]->pt, v->pt, t->vtx[o]->pt);
  }

  // flips two triangles and adjusts neighboring triangles
  void flip() { 
    simplex s = across();
    int o1 = mod3(o+1);
    int os1 = mod3(s.o+1);

    tri_t *t1 = t->ngh[o1];
    tri_t *t2 = s.t->ngh[os1];
    vtx_t *v1 = t->vtx[o1];
    vtx_t *v2 = s.t->vtx[os1];

    t->vtx[o]->t = s.t;
    t->vtx[o] = v2;
    t->ngh[o] = t2;
    if (t2 != NULL) t2->update(s.t,t);
    t->ngh[o1] = s.t;

    s.t->vtx[s.o]->t = t;
    s.t->vtx[s.o] = v1;
    s.t->ngh[s.o] = t1;
    if (t1 != NULL) t1->update(t,s.t);
    s.t->ngh[os1] = t;
  }

  // splits the triangle into three triangles with new vertex v in the middle
  // updates all neighboring simplices
  // ta0 and ta0 are pointers to the memory to use for the two new triangles
  void split(vtx_t* v, tri_t* ta0, tri_t* ta1) {
    v->t = t;
    tri_t *t1 = t->ngh[0]; tri_t *t2 = t->ngh[1]; tri_t *t3 = t->ngh[2];
    vtx_t *v1 = t->vtx[0]; vtx_t *v2 = t->vtx[1]; vtx_t *v3 = t->vtx[2];
    t->ngh[1] = ta0;        t->ngh[2] = ta1;
    t->vtx[1] = v;
    ta0->setT(t2,ta1,t);  ta0->setV(v2,v,v1);
    ta1->setT(t3,t,ta0);  ta1->setV(v3,v,v2);
    if (t2 != NULL) t2->update(t,ta0);      
    if (t3 != NULL) t3->update(t,ta1);
    v2->t = ta0;
  }

  // splits one of the boundaries of a triangle to form two triangles
  // the orientation dictates which edge to split (i.e., t.ngh[o])
  // ta is a pointer to memory to use for the new triangle
  void splitBoundary(vtx_t* v, tri_t* ta) {
    int o1 = mod3(o+1);
    int o2 = mod3(o+2);
    if (t->ngh[o] != NULL) {
      cout << "simplex::splitBoundary: not boundary" << endl; abort();}
    v->t = t;
    tri_t *t2 = t->ngh[o2];
    vtx_t *v1 = t->vtx[o1]; vtx_t *v2 = t->vtx[o2];
    t->ngh[o2] = ta;   t->vtx[o2] = v;
    ta->setT(t2,NULL,t);  ta->setV(v2,v,v1);
    if (t2 != NULL) t2->update(t,ta);      
    v2->t = t;
  }

  // given a vtx v, extends a boundary edge (t.ngh[o]) with an extra 
  // triangle on that edge with apex v.  
  // ta is used as the memory for the triangle
  simplex extend(vtx_t* v, tri_t* ta) {
    if (t->ngh[o] != NULL) {
      cout << "simplex::extend: not boundary" << endl; abort();}
    t->ngh[o] = ta;
    ta->setV(t->vtx[o], t->vtx[mod3(o+2)], v);
    ta->setT(NULL,t,NULL);
    v->t = ta;
    return simplex(ta,0);
  }

};

// this might or might not be needed
// void topologyFromTriangles(triangles<point2d> Tri, vtx** vr, tri** tr);

#endif // _TOPOLOGY_INCLUDED


================================================
FILE: algorithms/bench/common/topology_from_triangles.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <iostream>
#include <algorithm>
#include "../parlay/hash_table.h"
#include "../parlay/primitives.h"
#include "get_time.h"
#include "atomics.h"
#include "geometry.h"
#include "topology.h"

using parlay::parallel_for;
using parlay::hash64;
using parlay::sequence;
using parlay::tabulate;
using parlay::hashtable;

using std::pair;
using std::cout;
using std::endl;
using std::less;

using triang_t = triangle<point>;
using vertex_t = vertex<point>;
using simplex_t = simplex<point>;
using index_t = int;
using index_pair = pair<index_t,index_t>;
using edge = pair<index_pair, triang_t*>;

// Helper describing how `edge` records are stored in the hash table:
// entries are pointers to edges, keyed by the edge's pair of indices.
struct hashEdges {
  using kType = index_pair;
  using eType = edge*;

  // value marking an unused slot
  eType empty() { return nullptr; }

  // key of an entry: the index pair of the edge
  kType getKey(eType v) { return v->first; }

  // combines the hashes of both indices; the factor 3 makes the result
  // depend on the order of the two indices
  size_t hash(kType s) {
    const auto h1 = hash64(s.first);
    const auto h2 = hash64(s.second);
    return h1 + 3*h2;
  }

  // lexicographic three-way comparison of two keys: 1, -1 or 0
  int cmp(kType s1, kType s2) {
    if (s1.first > s2.first) return 1;
    if (s1.first < s2.first) return -1;
    if (s1.second > s2.second) return 1;
    if (s1.second < s2.second) return -1;
    return 0;
  }

  bool cas(eType* p, eType o, eType n) {
    return pbbs::atomic_compare_and_swap(p, o, n);
  }

  // always answers "no"
  bool replaceQ(eType s, eType s2) { return false; }
};

using EdgeTable = hashtable<hashEdges>;

// Returns an EdgeTable constructed from the size argument m and a
// default-constructed hashEdges helper.
EdgeTable makeEdgeTable(size_t m) {
  return EdgeTable(m,hashEdges());}

// Converts an indexed triangle list into pointer-linked triangles and
// vertices.
//
// Returns (triangles, vertices).  The vertex sequence holds the n input
// points followed by extra_points default-constructed vertices; the
// triangle sequence holds the m input triangles followed by 2*extra_points
// further entries that are not touched here.  Neighbour pointers are found
// by entering every directed edge of every triangle into a hash table and
// looking up the reversed edge; ngh[j] is NULL where no triangle has it.
std::pair<sequence<triang_t>,sequence<vertex_t>>
topology_from_triangles(triangles<point> &Tri, size_t extra_points = 0) {
  size_t n = Tri.numPoints();
  size_t m = Tri.numTriangles();

  auto V = tabulate(n + extra_points, [&] (size_t i) {
    if (i < n) return vertex_t(Tri.P[i], i);
    return vertex_t();
  });

  sequence<triang_t> Triangs(m + 2 * extra_points);
  sequence<edge> E(m*3);
  EdgeTable ET = makeEdgeTable(m*6);

  // pass 1: record the three directed edges of every triangle and set its
  // vertex pointers
  parallel_for (0, m, [&] (size_t i) {
    triang_t *tri = &Triangs[i];
    for (int j = 0; j < 3; j++) {
      const int next = (j+1)%3;
      edge *slot = &E[i*3 + j];
      *slot = edge(index_pair(Tri.T[i][j], Tri.T[i][next]), tri);
      ET.insert(slot);
      tri->vtx[(j+2)%3] = &V[Tri.T[i][j]];
    }
  });

  // pass 2: fill in the bookkeeping fields and link each triangle to the
  // owner of the reversed edge, if there is one
  parallel_for (0, m, [&] (size_t i) {
    triang_t &tri = Triangs[i];
    tri.id = i;
    tri.initialized = 1;
    tri.bad = 0;
    for (int j = 0; j < 3; j++) {
      index_pair key = {Tri.T[i][(j+1)%3], Tri.T[i][j]};
      edge *Ed = ET.find(key);
      tri.ngh[j] = (Ed != NULL) ? Ed->second : nullptr;
      // (no reversed edge found means a boundary edge)
      //Triangs[i].vtx[j]->boundary = 1;
      //Triangs[i].vtx[(j+2)%3]->boundary = 1;
    }
  });
  return std::pair(std::move(Triangs),std::move(V));
}

// Note that this is not currently a complete test of correctness
// For example it would allow a set of disconnected triangles, or even no
// triangles
// Checks every initialized triangle against each of its neighbours:
//  - the neighbour's far vertex has to lie outside the triangle, and
//  - it must not lie inside the triangle's circumcircle,
// both with a tolerance of 1e-10 on the normalized measure.
// Returns true (1) if a violation was found, after printing the smallest
// offending triangle index, and false (0) otherwise.
// boundary_size is currently unused: the boundary count is collected but
// the comparison against it is commented out below.
bool check_delaunay(sequence<triang_t> &Triangles, size_t boundary_size) {
  size_t n = Triangles.size();
  sequence<size_t> boundary_count(n, 0);
  // n serves as "no error"; write_min keeps the smallest failing index
  size_t insideOutError = n;
  size_t inCircleError = n;
  parallel_for (0, n, [&] (size_t i) {
    if (Triangles[i].initialized) {
      simplex_t t = simplex(&Triangles[i],0);
      for (int j=0; j < 3; j++) {
        simplex_t a = t.across();
        if (a.valid()) {
          vertex_t* v = a.rotClockwise().firstVertex();

          // Check that the neighbor is outside the triangle
          if (!t.outside(v)) {
            double vz = triAreaNormalized(t.t->vtx[(t.o+2)%3]->pt, 
                                          v->pt, t.t->vtx[t.o]->pt);
            // allow for small error
            if (vz < -1e-10) pbbs::write_min(&insideOutError, i, less<size_t>());
          }

          // Check that the neighbor is not in circumcircle of the triangle
          if (t.inCirc(v)) {
            double vz = inCircleNormalized(t.t->vtx[0]->pt, t.t->vtx[1]->pt, 
                                           t.t->vtx[2]->pt, v->pt);
            // allow for small error
            if (vz > 1e-10) pbbs::write_min(&inCircleError, i, less<size_t>());
          }
        } else boundary_count[i]++;
        t = t.rotClockwise();
      }
    }
  });
  // if (boundary_size != reduce(boundary_count))
  //   cout << "Wrong boundary size: should be " << boundary_size 
  // 	 << " is " << reduce(boundary_count) << endl;

  if (insideOutError < n) {
    // report the triangle that failed this test (not the in-circle one)
    cout << "delaunayCheck: neighbor inside triangle at triangle " 
         << insideOutError << endl;
    return 1;
  }
  if (inCircleError < n) {
    cout << "In Circle Violation at triangle " << inCircleError << endl;
    return 1;
  }

  return 0;
}


================================================
FILE: algorithms/bench/get_time.h
================================================
#pragma once

#include <stdlib.h>
#include <sys/time.h>
#include <iomanip>
#include <iostream>
#include <string>

namespace cpam {

// Wall-clock stopwatch based on gettimeofday.
// Time is accumulated in total_time over start()/stop() intervals; the
// report functions print "<name>: [<label>: ]<seconds>" to std::cout with
// four decimals.
struct timer {
  double total_time;   // seconds accumulated so far
  double last_time;    // time stamp of the last start() / get_next()
  bool on;             // true while the timer is running
  std::string name;    // prefix used by report()
  struct timezone tzp;

  // last_time is initialised as well, so that stop() on a timer that was
  // never started does not read an indeterminate value.
  timer(std::string name = "PBBS time", bool _start = true)
  : total_time(0.0), last_time(0.0), on(false), name(name), tzp({0,0}) {
    if (_start) start();
  }

  // current time of day in seconds
  double get_time() {
    timeval now;
    gettimeofday(&now, &tzp);
    return ((double) now.tv_sec) + ((double) now.tv_usec)/1000000.;
  }

  void start () {
    on = 1;
    last_time = get_time();
  }

  // stops the timer, adds the time since last_time to the total and
  // returns that interval
  double stop () {
    on = 0;
    double d = (get_time()-last_time);
    total_time += d;
    return d;
  }

  // clears the accumulated time and stops the timer
  void reset() {
     total_time=0.0;
     on=0;
  }

  // accumulated time, including the interval in progress if running
  double get_total() {
    if (on) return total_time + get_time() - last_time;
    else return total_time;
  }

  // Returns the time since last_time, adds it to the total and begins a
  // new interval; returns 0 when the timer is not running.
  double get_next() {
    if (!on) return 0.0;
    double t = get_time();
    double td = t - last_time;
    total_time += td;
    last_time = t;
    return td;
  }

  // prints one report line; the stream's format flags are restored after
  void report(double time, std::string str) {
    std::ios::fmtflags cout_settings = std::cout.flags();
    std::cout.precision(4);
    std::cout << std::fixed;
    std::cout << name << ": ";
    if (str.length() > 0)
      std::cout << str << ": ";
    std::cout << time << std::endl;
    std::cout.flags(cout_settings);
  }

  // reports the total under the label "total" and clears total_time
  void total() {
    report(get_total(),"total");
    total_time = 0.0;
  }

  // reports the total under label str without clearing it
  void reportTotal(std::string str) {
    report(get_total(), str);
  }

  // reports the interval returned by get_next(); no output when stopped
  void next(std::string str) {
    if (on) report(get_next(), str);
  }
};

}  // namespace cpam


================================================
FILE: algorithms/bench/neighborsTime.C
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <iostream>
#include <algorithm>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parse_command_line.h"
#include "time_loop.h"
#include "../utils/NSGDist.h"
#include "../utils/euclidian_point.h"
#include "../utils/point_range.h"
#include "../utils/mips_point.h"
#include "../utils/graph.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

using namespace parlayANN;

// *************************************************************
//  TIMING
// *************************************************************

using uint = unsigned int;


// Runs ANN once on the given graph, build parameters, query points and
// ground truth inside time_loop (one round, no warm-up delay), and then
// saves the graph to outFile if a file name was supplied.
template<typename Point, typename PointRange, typename indexType>
void timeNeighbors(Graph<indexType> &G,
		   PointRange &Query_Points, long k,
		   BuildParams &BP, char* outFile,
		   groundTruth<indexType> GT, char* res_file, bool graph_built, PointRange &Points)
{
    // no set-up or tear-down work is needed around the timed call
    auto nothing = [] () {};
    auto runANN = [&] () {
      ANN<Point, PointRange, indexType>(G, k, BP, Query_Points, GT, res_file, graph_built, Points);
    };
    time_loop(1, 0, nothing, runANN, nothing);

    if (outFile == NULL) return;
    G.save(outFile);
}

// Benchmark driver: parses the command line, loads base points, query
// points, ground truth and (optionally) an existing graph, then runs
// timeNeighbors with the point type selected by -data_type, -dist_func
// and -quantize_bits.
int main(int argc, char* argv[]) {
    commandLine P(argc,argv,
    "[-alpha <a>] [-delta <d>] [-R <deg>]"
        "[-L <bm>] [-k <k> ]  [-gt_path <g>] [-query_path <qF>]"
        "[-graph_path <gF>] [-graph_outfile <oF>] [-res_path <rF>]" "[-num_passes <np>]"
        "[-memory_flag <algoOpt>] [-mst_deg <q>] [-num_clusters <nc>] [-cluster_size <cs>]"
        "[-data_type <tp>] [-dist_func <df>] [-base_path <b>] <inFile>");

  char* iFile = P.getOptionValue("-base_path");
  char* oFile = P.getOptionValue("-graph_outfile");
  char* gFile = P.getOptionValue("-graph_path");
  char* qFile = P.getOptionValue("-query_path");
  char* cFile = P.getOptionValue("-gt_path");
  char* rFile = P.getOptionValue("-res_path");
  char* vectype = P.getOptionValue("-data_type");
  long Q = P.getOptionIntValue("-Q", 0);
  long R = P.getOptionIntValue("-R", 0);
  if(R<0) P.badArgument();
  long L = P.getOptionIntValue("-L", 0);
  if(L<0) P.badArgument();
  long MST_deg = P.getOptionIntValue("-mst_deg", 0);
  if(MST_deg < 0) P.badArgument();
  long num_clusters = P.getOptionIntValue("-num_clusters", 0);
  if(num_clusters<0) P.badArgument();
  long cluster_size = P.getOptionIntValue("-cluster_size", 0);
  if(cluster_size<0) P.badArgument();
  long k = P.getOptionIntValue("-k", 0);
  if (k > 1000 || k < 0) P.badArgument();
  double alpha = P.getOptionDoubleValue("-alpha", 1.0);
  int num_passes = P.getOptionIntValue("-num_passes", 1);
  int two_pass = P.getOptionIntValue("-two_pass", 0);
  if(two_pass > 1 || two_pass < 0) P.badArgument();
  if (two_pass == 1) num_passes = 2;
  double delta = P.getOptionDoubleValue("-delta", 0);
  if(delta<0) P.badArgument();
  char* dfc = P.getOptionValue("-dist_func");
  int quantize = P.getOptionIntValue("-quantize_bits", 0);
  int quantize_build = P.getOptionIntValue("-quantize_mode", 0);
  bool verbose = P.getOption("-verbose");
  bool graph_stats = P.getOption("-graph_stats");
  bool normalize = P.getOption("-normalize");
  double trim = P.getOptionDoubleValue("-trim", 0.0); // not used
  bool self = P.getOption("-self");
  int rerank_factor = P.getOptionIntValue("-rerank_factor", 10);
  bool range = P.getOption("-range");
  bool is_early_stop = P.getOption("-early_stop");
  char* sm = P.getOptionValue("-search_mode");
  double esr = P.getOptionDoubleValue("-early_stopping_radius", 0);
  double radius  = P.getOptionDoubleValue("-r", 0.0);
  double batch_factor = P.getOptionDoubleValue("-batch_factor", .125);
  
  // this integer represents the number of random edges to start with for
  // inserting in a single batch per round
  int single_batch = P.getOptionIntValue("-single_batch", 0);
    
  // A missing option yields a null pointer, and building a std::string
  // from one is undefined.  Map it to "" so that the validation further
  // down reports the problem instead.
  std::string df = (dfc == nullptr) ? "" : std::string(dfc);
  std::string tp = (vectype == nullptr) ? "" : std::string(vectype);

  std::string searchType = (sm == nullptr) ? "" : std::string(sm);
  rangeQueryType rtype = Beam;

  if (searchType == "doubling") {
    rtype = Doubling;
    std::cout << "Using doubling range search" << std::endl;
  } else if (searchType == "greedy") {
    rtype = Greedy;
    std::cout << "Using greedy range search" << std::endl;
  }
  else if (searchType == "beam") {
    rtype = Beam;
    std::cout << "Using beam range search" << std::endl;
  }
  else rtype = None;
  
  BuildParams BP = BuildParams(R, L, alpha, num_passes,
                               num_clusters, cluster_size, MST_deg, delta,
                               verbose, quantize_build,
                               self, single_batch,
                               Q, trim,
                               rerank_factor, batch_factor,
                               is_early_stop, esr,
                               rtype, radius, graph_stats);
  long maxDeg = BP.max_degree();

  if((tp != "uint8") && (tp != "int8") && (tp != "float")){
    std::cout << "Error: vector type not specified correctly, specify int8, uint8, or float" << std::endl;
    abort();
  }

  if(df != "Euclidian" && df != "mips"){
    std::cout << "Error: specify distance type Euclidian or mips" << std::endl;
    abort();
  }

  // a graph file on the command line means the index is loaded, not built
  bool graph_built = (gFile != NULL);

  groundTruth<uint> GT = groundTruth<uint>(cFile);
  
  if(tp == "float"){
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<float>> Points(iFile);
      PointRange<Euclidian_Point<float>> Query_Points(qFile);
      if (normalize) {
        std::cout << "normalizing data" << std::endl;
        for (int i=0; i < Points.size(); i++) 
          Points[i].normalize();
        for (int i=0; i < Query_Points.size(); i++) 
          Query_Points[i].normalize();
      }
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      if (quantize == 8) {
        std::cout << "quantizing data to 1 byte" << std::endl;
        using QT = uint8_t;
        using QPoint = Euclidian_Point<QT>;
        using PR = PointRange<QPoint>;
        PR Points_(Points);
        // queries are quantized with the parameters derived from the base set
        PR Query_Points_(Query_Points, Points_.params);
        timeNeighbors<QPoint, PR, uint>(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_);
      } else if (quantize == 16) {
        std::cout << "quantizing data to 2 bytes" << std::endl;
        using Point = Euclidian_Point<uint16_t>;
        using PR = PointRange<Point>;
        PR Points_(Points);
        PR Query_Points_(Query_Points, Points_.params);
        timeNeighbors<Point, PR, uint>(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_);
      } else {
        using Point = Euclidian_Point<float>;
        using PR = PointRange<Point>;
        timeNeighbors<Point, PR, uint>(G, Query_Points, k, BP, oFile, GT, rFile, graph_built, Points);
      }
    } else if(df == "mips"){
      PointRange<Mips_Point<float>> Points(iFile);
      PointRange<Mips_Point<float>> Query_Points(qFile);
      if (normalize) {
        std::cout << "normalizing data" << std::endl;
        for (int i=0; i < Points.size(); i++) 
          Points[i].normalize();
        for (int i=0; i < Query_Points.size(); i++) 
          Query_Points[i].normalize();
      }
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      if (quantize == 8) {
        std::cout << "quantizing data to 1 byte" << std::endl;
        using Point = Quantized_Mips_Point<8>;
        using PR = PointRange<Point>;
        PR Points_(Points);
        PR Query_Points_(Query_Points, Points_.params);
        timeNeighbors<Point, PR, uint>(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_);
      } else if (quantize == 16) {
        std::cout << "quantizing data to 2 bytes" << std::endl;
        using Point = Quantized_Mips_Point<16>;
        using PR = PointRange<Point>;
        PR Points_(Points);
        PR Query_Points_(Query_Points, Points_.params);
        timeNeighbors<Point, PR, uint>(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_);
      } else {
        using Point = Mips_Point<float>;
        using PR = PointRange<Point>;
        timeNeighbors<Point, PR, uint>(G, Query_Points, k, BP, oFile, GT, rFile, graph_built, Points);
      }
    }
  } else if(tp == "uint8"){
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<uint8_t>> Points(iFile);
      PointRange<Euclidian_Point<uint8_t>> Query_Points(qFile);
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      timeNeighbors<Euclidian_Point<uint8_t>, PointRange<Euclidian_Point<uint8_t>>, uint>(G, Query_Points, k, BP, 
        oFile, GT, rFile, graph_built, Points);
    } else if(df == "mips"){
      PointRange<Mips_Point<uint8_t>> Points(iFile);
      PointRange<Mips_Point<uint8_t>> Query_Points(qFile);
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      timeNeighbors<Mips_Point<uint8_t>, PointRange<Mips_Point<uint8_t>>, uint>(G, Query_Points, k, BP, 
        oFile, GT, rFile, graph_built, Points);
    }
  } else if(tp == "int8"){
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<int8_t>> Points(iFile);
      PointRange<Euclidian_Point<int8_t>> Query_Points(qFile);
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      timeNeighbors<Euclidian_Point<int8_t>, PointRange<Euclidian_Point<int8_t>>, uint>(G, Query_Points, k, BP,
        oFile, GT, rFile, graph_built, Points);
    } else if(df == "mips"){
      PointRange<Mips_Point<int8_t>> Points(iFile);
      PointRange<Mips_Point<int8_t>> Query_Points(qFile);
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      timeNeighbors<Mips_Point<int8_t>, PointRange<Mips_Point<int8_t>>, uint>(G, Query_Points, k, BP,
        oFile, GT, rFile, graph_built, Points);
    }
  }
  
  return 0;
}


================================================
FILE: algorithms/bench/parallelDefsANN
================================================
# Compiler and linker settings shared by the ANN benchmark makefiles.
#
# jemalloc is linked only when `which jemalloc-config` prints something;
# otherwise JEMALLOC stays empty and nothing extra is linked.
ifeq (, $(shell which jemalloc-config))
JEMALLOC =
else
JEMALLOCLD = $(shell jemalloc-config --libdir)
JEMALLOC = -L$(JEMALLOCLD) -ljemalloc 
endif

# Flags common to every configuration below.
CCFLAGS = -mcx16 -O3 -std=c++17 -march=native -DNDEBUG -I .
CLFLAGS = -ldl $(JEMALLOC)

# Flag sets for the three alternative configurations.
OMPFLAGS = -DPARLAY_OPENMP -fopenmp
CILKFLAGS = -DPARLAY_CILK -fcilkplus
PBBFLAGS = -DHOMEGROWN -pthread

# Selection: defining OPENMP picks the OpenMP flags, otherwise defining
# CILK picks the Cilk flags, otherwise the -DHOMEGROWN -pthread set is used.
# All three branches compile with g++.
ifdef OPENMP
CC = g++
CFLAGS = $(OMPFLAGS) $(CCFLAGS)
LFLAGS = $(OMPFLAGS) $(CLFLAGS)

else ifdef CILK
CC = g++
CFLAGS = $(CILKFLAGS) $(CCFLAGS)
LFLAGS = $(CILKFLAGS) $(CLFLAGS)

else
CC = g++
CFLAGS = $(PBBFLAGS) $(CCFLAGS)
LFLAGS = $(PBBFLAGS) $(CLFLAGS)
endif


================================================
FILE: algorithms/bench/parse_command_line.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <iostream>
#include <fstream>
#include <string>
#include <cstring>

// Small helper around argc/argv.  It answers three kinds of questions:
//   * is a flag present                       (getOption)
//   * what value follows a flag               (getOptionValue and friends)
//   * what are the trailing positional args   (getArgument, IOFileNames, ...)
// Any malformed request prints "usage: <argv[0]> <comLine>" and terminates
// the process through badArgument().
struct commandLine {
  int argc;
  char** argv;
  std::string comLine;  // usage text printed after the program name

  // Stores the arguments and the usage text.  If "-h" or "-help" appears
  // anywhere after argv[0] the usage line is printed and the process exits.
  commandLine(int _c, char** _v, std::string _cl)
    : argc(_c), argv(_v), comLine(_cl) {
    if (getOption("-h") || getOption("-help"))
      badArgument();
  }

  // Variant without a usage string; the usage text is "bad arguments".
  commandLine(int _c, char** _v)
    : argc(_c), argv(_v), comLine("bad arguments") { }

  // Prints the usage line to stdout and terminates the process.
  // Note: the exit status is 0 even though this is also the error path.
  void badArgument() {
    std::cout << "usage: " << argv[0] << " " << comLine << std::endl;
    exit(0);
  }

  // get an argument
  // i is indexed from the last argument = 0, second to last indexed 1, ..
  char* getArgument(int i) {
    if (argc < 2+i) badArgument();
    return argv[argc-1-i];
  }

  // looks for two filenames: the last two command-line arguments, in order
  std::pair<char*,char*> IOFileNames() {
    if (argc < 3) badArgument();
    return std::pair<char*,char*>(argv[argc-2],argv[argc-1]);
  }

  // Interprets the last two arguments as <size> <filename>.
  // The size is parsed with atoi, so non-numeric text yields 0.
  std::pair<size_t,char*> sizeAndFileName() {
    if (argc < 3) badArgument();
    return std::pair<size_t,char*>(std::atoi(argv[argc-2]),(char*) argv[argc-1]);
  }

  // True when `option` matches any argument after argv[0] exactly.
  bool getOption(const std::string& option) {
    for (int i = 1; i < argc; i++)
      if (option == argv[i]) return true;
    return false;
  }

  // Returns the argument that follows the first occurrence of `option`,
  // or a null pointer when the option is absent.  The last argument is never
  // treated as an option because nothing can follow it.
  char* getOptionValue(const std::string& option) {
    for (int i = 1; i < argc-1; i++)
      if (option == argv[i]) return argv[i+1];
    return nullptr;
  }

  // String flavour of getOptionValue with a fallback value.
  std::string getOptionValue(const std::string& option, std::string defaultValue) {
    const char* text = getOptionValue(option);
    if (text == nullptr) return defaultValue;
    return std::string(text);
  }

  // Value of `option` parsed with atol; negative values are rejected through
  // badArgument().  Returns defaultValue when the option is absent.
  long getOptionLongValue(const std::string& option, long defaultValue) {
    const char* text = getOptionValue(option);
    if (text == nullptr) return defaultValue;
    long r = atol(text);
    if (r < 0) badArgument();
    return r;
  }

  // Value of `option` parsed with atoi; negative values are rejected through
  // badArgument().  Returns defaultValue when the option is absent.
  int getOptionIntValue(const std::string& option, int defaultValue) {
    const char* text = getOptionValue(option);
    if (text == nullptr) return defaultValue;
    int r = atoi(text);
    if (r < 0) badArgument();
    return r;
  }

  // Value of `option` parsed as a double.  Returns defaultValue when the
  // option is absent; text that does not parse as a number is rejected.
  double getOptionDoubleValue(const std::string& option, double defaultValue) {
    const char* text = getOptionValue(option);
    if (text == nullptr) return defaultValue;
    double val = 0.0;
    // sscanf reports the number of successful conversions: it returns 0 for
    // non-numeric text and EOF only for empty input, so anything other than
    // exactly one conversion means the value is unusable.
    if (sscanf(text, "%lf", &val) != 1) {
      badArgument();
    }
    return val;
  }

};


================================================
FILE: algorithms/bench/time_loop.h
================================================
#include "get_time.h"

#ifndef TIMELOOP
#define TIMELOOP

template<class F, class G, class H>
void time_loop(int rounds, double delay, F initf, G runf, H endf) {
  parlay::internal::timer t;
  // run for delay seconds to "warm things up"
  // will skip if delay is zero
  while (t.total_time() < delay) {
    initf(); runf(); endf();
  } 
  for (int i=0; i < rounds; i++) {
    initf();
    t.start();
    runf();
    t.next("");
    endf();
  }
}

#endif


================================================
FILE: algorithms/pyNNDescent/CMakeLists.txt
================================================
# pyNNDescent benchmark executable. It is built from the shared driver source
# in ../bench; this directory's neighbors.h is registered as a precompiled
# header for the target.
add_executable(neighbors-pynndescent ../bench/neighborsTime.C)
  # parlay is the only library linked explicitly.
  target_link_libraries(neighbors-pynndescent PRIVATE parlay)
  target_precompile_headers(neighbors-pynndescent PRIVATE neighbors.h)


================================================
FILE: algorithms/pyNNDescent/Makefile
================================================
# Makefile for the pyNNDescent benchmark.
# Pulls in the shared compiler/linker settings first.
include ../bench/parallelDefsANN

# REQUIRE and BENCH are presumably consumed by ../bench/MakeBench (not shown
# here): REQUIRE looks like the list of header dependencies and BENCH the name
# of the binary to produce -- confirm against MakeBench.
REQUIRE =  ../utils/beamSearch.h pynn_index.h ../utils/graph.h clusterPynn.h
BENCH = neighbors

# Shared build rules.
include ../bench/MakeBench

================================================
FILE: algorithms/pyNNDescent/clusterPynn.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <math.h>

#include <algorithm>
#include <functional>
#include <queue>
#include <random>
#include <set>

#include "../HCNNG/clusterEdge.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"
#include "../utils/union.h"

namespace parlayANN {
  
template<typename Point, typename PointRange, typename indexType>
struct clusterPID {
  using distanceType = typename Point::distanceType;
	using PR = PointRange;
  using edge = std::pair<indexType , indexType >;
  using pid = std::pair<indexType , distanceType>;

  clusterPID() {}

  parlay::sequence<parlay::sequence<pid>> intermediate_edges;

  void naive_neighbors(PR &Points,
                       parlay::sequence<size_t>& active_indices,
                       long maxK) {
    size_t n = active_indices.size();
    parlay::parallel_for(0, n, [&](size_t i) {
      auto less = [&](pid a, pid b) { return a.second < b.second; };
      std::priority_queue<pid, std::vector<pid>, decltype(less)> Q(less);
      size_t index = active_indices[i];
      // tabulate all-pairs distances between the elements in the leaf
      for (indexType  j = 0; j < n; j++) {
        if (j != i) {
          distanceType dist = Points[index].distance(Points[active_indices[j]]);
          pid e = std::make_pair(active_indices[j], dist);
          if (Q.size() >= maxK) {
            distanceType topdist = Q.top().second;
            if (dist < topdist) {
              Q.pop();
              Q.push(e);
            }
          } else {
            Q.push(e);
          }
        }
      }
      size_t q = Q.size();
      parlay::sequence<pid> sorted_edges(q);
      for (indexType  j = 0; j < q; j++) {
        sorted_edges[j] = Q.top();
        Q.pop();
      }
      auto rev_edges = parlay::reverse(sorted_edges);
      auto [new_best, changed] =
          seq_union_bounded(intermediate_edges[index], rev_edges, maxK, less);
      intermediate_edges[index] = new_best;
    });
  }


  void random_clustering(PR &Points,
                         parlay::sequence<size_t>& active_indices,
                         parlay::random& rnd, long cluster_size,
                         long K) {
    if (active_indices.size() <= cluster_size)
      naive_neighbors(Points, active_indices, K);
    else {
      auto [f, s] = select_two_random(active_indices, rnd);

      auto left_rnd = rnd.fork(0);
      auto right_rnd = rnd.fork(1);

      if (Points[f]==Points[s]) {
        parlay::sequence<size_t> closer_first;
        parlay::sequence<size_t> closer_second;
        for (indexType i = 0; i < active_indices.size(); i++) {
          if (i < active_indices.size() / 2)
            closer_first.push_back(active_indices[i]);
          else
            closer_second.push_back(active_indices[i]);
        }
        auto left_rnd = rnd.fork(0);
        auto right_rnd = rnd.fork(1);
        parlay::par_do(
            [&]() {
              random_clustering(Points, closer_first, left_rnd, cluster_size,
                                K);
            },
            [&]() {
              random_clustering(Points, closer_second, right_rnd, cluster_size,
                                K);
            });
      } else {
        // Split points based on which of the two points are closer.
        auto closer_first =
            parlay::filter(parlay::make_slice(active_indices), [&](size_t ind) {
              distanceType dist_first = Points[ind].distance(Points[f]);
              distanceType dist_second = Points[ind].distance(Points[s]);
              return dist_first <= dist_second;
            });

        auto closer_second =
            parlay::filter(parlay::make_slice(active_indices), [&](size_t ind) {
              distanceType dist_first = Points[ind].distance(Points[f]);
              distanceType dist_second = Points[ind].distance(Points[s]);
              return dist_second < dist_first;
            });


        parlay::par_do(
            [&]() {
              random_clustering(Points, closer_first, left_rnd, cluster_size, 
                                K);
            },
            [&]() {
              random_clustering(Points, closer_second, right_rnd, cluster_size,
                                  K);
        });

      }
    }
  }

  void random_clustering_wrapper(PR &Points,
                                 long cluster_size, long K) {
    std::random_device rd;
    std::mt19937 rng(rd());
    std::uniform_int_distribution<indexType> uni(0, Points.size());
    parlay::random rnd(uni(rng));
    auto active_indices =
        parlay::tabulate(Points.size(), [&](size_t i) { return i; });
    random_clustering(Points, active_indices, rnd, cluster_size, K);
  }

  void multiple_clustertrees(PR &Points,
                             long cluster_size, long num_clusters,
                             long K,
                             parlay::sequence<parlay::sequence<pid>>& old_nbh) {
    intermediate_edges = parlay::sequence<parlay::sequence<pid>>(Points.size());
    for (long i = 0; i < num_clusters; i++) {
      random_clustering_wrapper(Points, cluster_size, K);
      std::cout << "Cluster " << i << std::endl; 
    }
    parlay::parallel_for(0, Points.size(),
                         [&](size_t i) { old_nbh[i] = intermediate_edges[i]; });
  }
};

} // end namespace


================================================
FILE: algorithms/pyNNDescent/neighbors.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <algorithm>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"
#include "../utils/NSGDist.h"  
#include "../utils/types.h"
#include "pynn_index.h"
#include "../utils/beamSearch.h"  
#include "../utils/stats.h"
#include "../utils/parse_results.h"
#include "../utils/check_nn_recall.h"

namespace parlayANN {

// Benchmark entry point for pyNNDescent.
//   G            : graph to build into, or an already built graph
//   k            : forwarded to search_and_parse
//   BP           : build parameters (R, delta, cluster_size, num_clusters,
//                  alpha and verbose are read here)
//   Query_Points : queries; the search step is skipped when this is empty
//   GT, res_file : forwarded to search_and_parse
//   graph_built  : when true the build step is skipped and the reported build
//                  time is 0
//   Points       : the base points
template<typename Point, typename PointRange, typename indexType>
void ANN(Graph<indexType> &G, long k, BuildParams &BP,
         PointRange &Query_Points,
         groundTruth<indexType> GT, char *res_file,
         bool graph_built, PointRange &Points) {
  using findex = pyNN_index<Point, PointRange, indexType>;

  parlay::internal::timer t("ANN");
  long K = BP.R;

  // Build only if the caller did not supply a finished graph.
  double idx_time = 0;
  if (!graph_built) {
    findex I(K, BP.delta);
    I.build_index(G, Points, BP.cluster_size, BP.num_clusters, BP.alpha);
    idx_time = t.next_time();
  }

  // Summarise the graph and print the summary.
  std::string name = "pyNNDescent";
  std::string params = "K = " + std::to_string(K);
  auto [avg_deg, max_deg] = graph_stats_(G);
  Graph_ G_(name, params, G.size(), avg_deg, max_deg, idx_time);
  G_.print();

  if (Query_Points.size() == 0) return;
  search_and_parse(G_, G, Points, Query_Points, GT, res_file, k, BP.verbose);
}

} // end namespace


================================================
FILE: algorithms/pyNNDescent/pynn_index.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <algorithm>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"
#include "parlay/internal/get_time.h"
#include <random>
#include <set>
#include <queue>
#include <math.h>
#include "clusterPynn.h"

namespace parlayANN {

// pyNNDescent index builder.
//
// build_index() runs three phases:
//   1. clusterPID::multiple_clustertrees fills `old_neighbors` with initial
//      (id, distance) lists,
//   2. nn_descent_wrapper() refines those lists round by round,
//   3. undirect_and_prune() adds reverse edges, prunes with `alpha` and writes
//      at most K out-neighbors per vertex into the graph.
template<typename Point, typename PointRange, typename indexType>
struct pyNN_index{
    using distanceType = typename Point::distanceType;
    using GraphI = Graph<indexType>;
    using PR = PointRange;
    using edge = std::pair<indexType, indexType>;       // (id, id)
    using pid = std::pair<indexType,distanceType>;      // (id, distance)
    using labelled_edge = std::pair<indexType, pid>;    // (source, (target, distance))

    long K;        // bound on neighbor-list length and on final out-degree
    double delta;  // descent stops once fewer than delta*n lists changed in a round

    // Comparator over `edge` (integer pairs).  It must not be applied to `pid`
    // values: they would be converted to `edge` and lose their distances.
    static constexpr auto less = [] (edge a, edge b) {return a.second < b.second;};

    pyNN_index(long md, double Delta) : K(md), delta(Delta) {}

    // old_neighbors[v]: current neighbor candidates of v as (id, distance).
    parlay::sequence<parlay::sequence<pid>> old_neighbors;

    // Inserts p into a heap capped at 2*K entries; once full, p replaces the
    // top entry only when p.second is smaller than the top's second component.
    void push_into_queue(std::priority_queue<edge, std::vector<edge>, decltype(less)> &Q, edge p){
        if(Q.size() < 2*K){
            Q.push(p);
        } else{
            indexType highest_p = Q.top().second;
            if(p.second < highest_p){
                Q.pop();
                Q.push(p);
            }
        }
    }

    // One descent round.  Builds the (sampled) reverse graph and processes it
    // in batches of 100000 vertices.  Returns a 0/1 flag per point telling
    // whether its neighbor list was updated in this round.
    parlay::sequence<int> nn_descent(PR &Points, parlay::sequence<int> &changed){
        auto new_changed = parlay::sequence<int>(Points.size(), 0);
        auto rev = reverse_graph();
        int batch_size = 100000;
        std::pair<indexType, parlay::sequence<indexType>> *begin;
        std::pair<indexType, parlay::sequence<indexType>> *end = rev.begin();
        while(end != rev.end()){
            begin = end;
            int remaining = rev.end() - end;
            end += std::min(remaining, batch_size);
            nn_descent_chunk(Points, changed, new_changed, begin, end);
        }
        return new_changed;
    }

    // Processes the reverse-graph entries in [begin, end).
    //
    // For each entry (index, reverse candidates) it proposes new edges between
    //   * pairs of reverse candidates, and
    //   * each forward neighbor of `index` paired with each reverse candidate,
    // skipping pairs whose `changed` flags are both 0.  A proposal (x -> y) is
    // kept only when the distance is below the distance stored in the last
    // entry of x's current list.  Proposals are then grouped by source vertex,
    // de-duplicated and merged into `old_neighbors` with seq_union_bounded;
    // `new_changed` is set for every vertex whose list was replaced.
    void nn_descent_chunk(PR &Points, parlay::sequence<int> &changed,
        parlay::sequence<int> &new_changed, std::pair<indexType, parlay::sequence<indexType>> *begin,
        std::pair<indexType, parlay::sequence<indexType>> *end){
        size_t stride = end - begin;
        auto grouped_labelled = parlay::tabulate(stride, [&] (size_t i){
            indexType index = (begin+i)->first;
            // Ids to exclude from the reverse candidates: the vertex itself
            // and everything already in its forward list.
            std::set<indexType> to_filter;
            to_filter.insert(index);
            for(indexType j=0; j<old_neighbors[index].size(); j++){
                to_filter.insert(old_neighbors[index][j].first);
            }
            auto f = [&] (indexType a) {return (to_filter.find(a) == to_filter.end());};
            auto filtered_candidates = parlay::filter((begin+i)->second, f);
            parlay::sequence<labelled_edge> edges;
            edges.reserve(K*2);
            // Reverse candidate vs. reverse candidate.
            for(indexType l=0; l<filtered_candidates.size(); l++){
                indexType j=filtered_candidates[l];
                distanceType j_max = old_neighbors[j][old_neighbors[j].size()-1].second;
                for(indexType m=l+1; m<filtered_candidates.size(); m++){
                    indexType k=filtered_candidates[m];
                    if (changed[j] || changed[k]) {
                        distanceType dist = Points[j].distance(Points[k]);
                        distanceType k_max = old_neighbors[k][old_neighbors[k].size()-1].second;
                        if(dist < j_max) edges.push_back(std::make_pair(j, std::make_pair(k, dist)));
                        if(dist < k_max) edges.push_back(std::make_pair(k, std::make_pair(j, dist)));
                    }
                }
            }
            // Forward neighbor vs. reverse candidate.
            for(indexType l=0; l<old_neighbors[index].size(); l++){
                indexType j = old_neighbors[index][l].first;
                for(const indexType& k : filtered_candidates){
                    if (changed[index] || changed[k]) {
                        distanceType dist = Points[j].distance(Points[k]);
                        distanceType j_max = old_neighbors[j][old_neighbors[j].size()-1].second;
                        distanceType k_max = old_neighbors[k][old_neighbors[k].size()-1].second;
                        if(dist < j_max) edges.push_back(std::make_pair(j, std::make_pair(k, dist)));
                        if(dist < k_max) edges.push_back(std::make_pair(k, std::make_pair(j, dist)));
                    }
                }
            }
            return edges;
        }, 1);
        auto candidates = parlay::group_by_key(parlay::flatten(grouped_labelled));
        parlay::parallel_for(0, candidates.size(), [&] (size_t i){
            // Order by distance, ties broken by id, so that repeated proposals
            // of the same target end up adjacent.
            auto less2 = [&] (pid a, pid b) {
                if(a.second < b.second) return true;
                else if(a.second == b.second){
                    if(a.first < b.first) return true;
                }
                return false;
            };
            parlay::sort_inplace(candidates[i].second, less2);
            // Drop adjacent repeats of the same target id.
            indexType cur_index=std::numeric_limits<indexType>::max();
            parlay::sequence<pid> filtered_candidates;
            for(const pid& p : candidates[i].second){
                if(p.first!=cur_index){
                    filtered_candidates.push_back(p);
                    cur_index = p.first;
                }
            }
            indexType index = candidates[i].first;
            auto less3 = [&] (pid a, pid b) {return a.second < b.second;};
            auto [new_edges, change] = seq_union_bounded(old_neighbors[index], filtered_candidates, K, less3);
            if(change){
                new_changed[index]=1;
                old_neighbors[index]=new_edges;
            }
        });
    }

    // Builds the reverse of `old_neighbors`: for every vertex that appears in
    // some list, the vertices whose lists contain it.  Each reverse list is
    // shuffled, de-duplicated and truncated to at most K entries.
    parlay::sequence<std::pair<indexType, parlay::sequence<indexType>>> reverse_graph(){
        parlay::sequence<parlay::sequence<edge>> to_group = parlay::tabulate(old_neighbors.size(), [&] (size_t i){
            size_t s = old_neighbors[i].size();
            parlay::sequence<edge> e(s);
            for(indexType j=0; j<s; j++){
                e[j] = std::make_pair(old_neighbors[i][j].first, (int) i);
            }
            return e;
        });
        auto sorted_graph =  parlay::group_by_key(parlay::flatten(to_group));
        parlay::parallel_for(0, sorted_graph.size(), [&] (size_t i){
            auto shuffled = parlay::remove_duplicates(parlay::random_shuffle(sorted_graph[i].second, i));
            indexType upper_bound = std::min((long) shuffled.size(), K);
            auto truncated = parlay::tabulate(upper_bound, [&] (size_t j){
                return shuffled[j];
            });
            sorted_graph[i].second = truncated;
        });
        return sorted_graph;
    }

    // Repeats nn_descent() until fewer than delta*n lists changed in a round or
    // the round limit is reached.  Prints progress and returns the number of
    // rounds executed.
    int nn_descent_wrapper(PR &Points){
        size_t n = Points.size();
        // Every point counts as changed before the first round.
        parlay::sequence<int> changed = parlay::tabulate(n, [&] (size_t i) {return 1;});
        int rounds = 0;
        int max_rounds = std::max(10, (int) log2(Points.dimension()));
        if(Points.dimension()==256) max_rounds=20; //hack for ssnpp
        while(parlay::reduce(changed) >= delta*n && rounds < max_rounds){
            auto new_changed = nn_descent(Points, changed);
            changed = new_changed;
            rounds++;
            std::cout << parlay::reduce(new_changed) << " elements changed" << std::endl;
            std::cout << "Round " << rounds << " of " <<  max_rounds << " completed" << std::endl;
        }

        std::cout << "descent converged in " << rounds << " rounds";
        if(rounds < max_rounds) std::cout << " (Early termination)";
        std::cout << std::endl;
        return rounds;
    }

    // Final phase.  Adds the reverse of every edge to `old_neighbors`, then
    // selects out-neighbors for each vertex of G: list entries are visited in
    // order and an entry is dropped when, for some already selected neighbor
    // k, its distance to the vertex exceeds alpha times its distance to k.
    // At most K neighbors are written per vertex.
    void undirect_and_prune(GraphI &G, PR &Points, double alpha){
        parlay::sequence<parlay::sequence<edge>> to_group = parlay::tabulate(old_neighbors.size(), [&] (size_t i){
            size_t s = old_neighbors[i].size();
            assert(s == K);
            parlay::sequence<edge> e(s);
            for(indexType j=0; j<s; j++){
                e[j] = std::make_pair(old_neighbors[i][j].first, (int) i);
            }
            return e;
        });
        auto undirected_graph = parlay::group_by_key_ordered(parlay::flatten(to_group));
        parlay::parallel_for(0, undirected_graph.size(), [&] (size_t i){
            indexType index = undirected_graph[i].first;
            auto filtered = parlay::remove_duplicates(undirected_graph[i].second);
            auto undirected_pids = parlay::tabulate(filtered.size(), [&] (size_t j){
                indexType indexU = filtered[j];
                distanceType dist = Points[index].distance(Points[indexU]);
                return std::make_pair(indexU, dist);
            });
            // Compare the distances themselves.  The member `less` takes
            // `edge` arguments, so using it here would implicitly convert
            // every (id, distance) pair to (id, id) and order by the distance
            // truncated to an integer.
            auto by_distance = [] (pid a, pid b) {return a.second < b.second;};
            parlay::sort_inplace(undirected_pids, by_distance);
            auto merged_pids = seq_union(old_neighbors[index], undirected_pids, by_distance);
            old_neighbors[index] = merged_pids;
        });
        parlay::parallel_for(0, G.size(), [&] (size_t i){
            parlay::sequence<indexType> new_out = parlay::sequence<indexType>();
            for(const pid& j : old_neighbors[i]){
                if(new_out.size() == K) break;
                else if(new_out.size() == 0) new_out.push_back(j.first);
                else{
                    distanceType dist_p = j.second;
                    bool add = true;
                    for(const indexType& k : new_out){
                        distanceType dist = Points[j.first].distance(Points[k]);
                        if(dist_p > alpha*dist) {add = false; break;}
                    }
                    if(add) new_out.push_back(j.first);
                }
            }
            G[i].update_neighbors(new_out);
        });
    }


    // Builds the index into G: cluster-tree initialisation, descent rounds,
    // then undirect-and-prune.  `old_neighbors` is sized from G.size().
    void build_index(GraphI &G, PR &Points, long cluster_size, long num_clusters, double alpha){
        clusterPID<Point, PointRange, indexType> C;
        old_neighbors = parlay::sequence<parlay::sequence<pid>>(G.size());
        C.multiple_clustertrees(Points, cluster_size, num_clusters, K, old_neighbors);
        nn_descent_wrapper(Points);
        undirect_and_prune(G, Points, alpha);
    }
};

} // end namespace


================================================
FILE: algorithms/pyNNDescent/scripts/nytimes
================================================
# bash
# Prints the pyNNDescent build and query command lines for the
# nytimes-256-angular dataset.  The commands are only echoed, not executed.
BUILD_ARGS="-R 40 -cluster_size 100 -num_clusters 10 -alpha 1.2 -delta 0.05 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -verbose"
TYPE_ARGS="-data_type float -dist_func mips -normalize -file_type bin"

# The dataset directory is deliberately NOT stored in PATH: assigning to PATH
# replaces the shell's command search path, which breaks every later lookup of
# an external command (and the caller's shell too if this file is sourced).
DATA_DIR=data/nytimes-256-angular
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_pynn_40_100

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query 
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/pyNNDescent/scripts/sift
================================================
# bash
# Prints the pyNNDescent build and query command lines for the
# sift-128-euclidean dataset.  The commands are only echoed, not executed.
BUILD_ARGS="-R 40 -cluster_size 100 -num_clusters 10 -alpha 1.2 -delta 0.05 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 8 -verbose"
TYPE_ARGS="-data_type float -dist_func Euclidian -file_type bin"

# The dataset directory is deliberately NOT stored in PATH: assigning to PATH
# replaces the shell's command search path, which breaks every later lookup of
# an external command (and the caller's shell too if this file is sourced).
DATA_DIR=data/sift-128-euclidean
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_pynn_40_100

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query 
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/tutorial.sh
================================================
#!/bin/bash
# Tutorial driver: builds and runs the Vamana, HCNNG and pyNNDescent benchmarks
# in turn on the SIFT learn set.  Run it from the algorithms/ directory; the
# data paths are relative to each algorithm's sub-directory.
#
# Every `cd` is checked: if a directory is missing, continuing would run
# `make` and `./neighbors` in the wrong place and report results under the
# wrong algorithm name.

cd vamana || exit 1
make
echo "Vamana:"
./neighbors -R 32 -L 64 -a 1.2 -graph_outfile ../../data/sift/sift_learn_32_64 -query_path ../../data/sift/sift_query.fvecs -gt_path ../../data/sift/sift-100K -res_path tutorial.csv -data_type float -file_type vec -dist_func Euclidian -base_path ../../data/sift/sift_learn.fvecs

echo ""
echo ""

cd ../HCNNG || exit 1
make
echo "HCNNG:"
./neighbors -R 3 -L 10 -a 1000 -memory_flag 1 -graph_outfile ../../data/sift/sift_learn_3_10 -query_path ../../data/sift/sift_query.fvecs -gt_path ../../data/sift/sift-100K -res_path tutorial.csv -data_type float -file_type vec -dist_func Euclidian -base_path ../../data/sift/sift_learn.fvecs

echo ""
echo ""

cd ../pyNNDescent || exit 1
make
echo "pyNNDescent:"
./neighbors -R 30 -L 100 -a 10 -d 1.2 -graph_outfile ../../data/sift/sift_learn_30 -query_path ../../data/sift/sift_query.fvecs -gt_path ../../data/sift/sift-100K -res_path tutorial.csv -data_type float -file_type vec -dist_func Euclidian -base_path ../../data/sift/sift_learn.fvecs

================================================
FILE: algorithms/utils/BUILD
================================================
# ANNS utility libraries.
# Each target below is a header-only cc_library wrapping one header of this
# directory; everything is visible to all packages under //algorithms.

package(default_visibility = ["//algorithms:__subpackages__"])


# CSV helper; no dependencies.
cc_library(
    name = "csvfile",
    hdrs = ["csvfile.h"],
)

cc_library(
    name = "beamSearch",
    hdrs = ["beamSearch.h"],
    deps = [
        "@parlaylib//parlay:io",
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay:random",
        ":graph",
        ":stats",
        ":types",
    ],
)

# NOTE(review): this target exports check_nn_recall.h, exactly like
# :check_nn_recall further down -- confirm whether a range-recall header was
# intended here.
cc_library(
    name = "check_range_recall",
    hdrs = ["check_nn_recall.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        ":beamSearch",
        ":csvfile",
        ":parse_results",
        ":stats",
        ":types",
    ],
)

# Note the spelling difference: the target is "euclidean_point" while the
# header file is "euclidian_point.h".
cc_library(
    name = "euclidean_point",
    hdrs = ["euclidian_point.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay/internal:file_map",
        ":parse_results",
        ":types",
    ],
)

cc_library(
    name = "graph",
    hdrs = ["graph.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay/internal:file_map",
        ":parse_results",
        ":types",
    ],
)

cc_library(
    name = "mips_point",
    hdrs = ["mips_point.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay/internal:file_map",
        ":types",
    ],
)

cc_library(
    name = "mmap",
    hdrs = ["mmap.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay/internal:file_map",
    ],
)

cc_library(
    name = "parse_results",
    hdrs = ["parse_results.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
    ],
)

cc_library(
    name = "check_nn_recall",
    hdrs = ["check_nn_recall.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        ":beamSearch",
        ":csvfile",
        ":parse_results",
        ":stats",
        ":types",
    ],
)

cc_library(
    name = "point_range",
    hdrs = ["point_range.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay/internal:file_map",
        ":types",
    ],
)

cc_library(
    name = "stats",
    hdrs = ["stats.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        ":graph",
    ],
)

cc_library(
    name = "types",
    hdrs = ["types.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        ":mmap",
    ],
)


cc_library(
    name = "union",
    hdrs = ["union.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
    ],
)

cc_library(
    name = "jl_point",
    hdrs = ["jl_point.h"],
    deps = [
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay/internal:file_map",
        ":types",
        ":mips_point",
    ],
)


================================================
FILE: algorithms/utils/NSGDist.h
================================================
//
// Created by 付聪 on 2017/6/21.
//

#ifndef EFANNA2E_DISTANCE_H
#define EFANNA2E_DISTANCE_H

#include <math.h>
#include <x86intrin.h>

#include <algorithm>
#include <iostream>
#include <type_traits>

#include "parlay/parallel.h"
#include "parlay/primitives.h"


namespace efanna2e {

// atomic_sum_counter<size_t> distance_calls;

// Metric identifiers with fixed integer values (L2 = 0 through PQ = 3).
// Unscoped enum: the enumerators are visible directly in namespace efanna2e.
enum Metric { L2 = 0, INNER_PRODUCT = 1, FAST_L2 = 2, PQ = 3 };
// Abstract interface for a distance function over float arrays.
// Implementations override compare(); objects may be destroyed through a
// pointer to this base because the destructor is virtual.
class Distance {
 public:
  virtual ~Distance() = default;

  // Returns the distance between the arrays `a` and `b`, each of which holds
  // `length` floats.
  virtual float compare(const float *a, const float *b,
                        unsigned length) const = 0;
};

class DistanceL2 : public Distance {
 public:
  float compare(const float *a, const float *b, unsigned size) const {
    float result = 0;

#ifdef __GNUC__
#ifdef __AVX__

#define AVX_L2SQR(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm256_loadu_ps(addr1);                  \
  tmp2 = _mm256_loadu_ps(addr2);                  \
  tmp1 = _mm256_sub_ps(tmp1, tmp2);               \
  tmp1 = _mm256_mul_ps(tmp1, tmp1);               \
  dest = _mm256_add_ps(dest, tmp1);

    __m256 sum;
    __m256 l0, l1;
    __m256 r0, r1;
    size_t qty16 = size >> 4;
    size_t aligned_size = qty16 << 4;
    const float *l = a;
    const float *r = b;

    float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};
    sum = _mm256_loadu_ps(unpack);

    for (unsigned i = 0; i < aligned_size; i += 16, l += 16, r += 16) {
      AVX_L2SQR(l, r, sum, l0, r0);
      AVX_L2SQR(l + 8, r + 8, sum, l1, r1);
    }
    _mm256_storeu_ps(unpack, sum);
    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7];
    for (unsigned i = aligned_size; i < size; ++i, ++l, ++r) {
      float diff = *l - *r;
      result += diff * diff;
    }
    
    /*
#else
#ifdef __SSE2__
#define SSE_L2SQR(addr1, addr2, dest, tmp1, tmp2) \
        tmp1 = _mm_load_ps(addr1);\
        tmp2 = _mm_load_ps(addr2);\
        tmp1 = _mm_sub_ps(tmp1, tmp2); \
        tmp1 = _mm_mul_ps(tmp1, tmp1); \
        dest = _mm_add_ps(dest, tmp1);

__m128 sum;
__m128 l0, l1, l2, l3;
__m128 r0, r1, r2, r3;
unsigned D = (size + 3) & ~3U;
unsigned DR = D % 16;
unsigned DD = D - DR;
const float *l = a;
const float *r = b;
const float *e_l = l + DD;
const float *e_r = r + DD;
float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0};

sum = _mm_load_ps(unpack);
switch (DR) {
    case 12:
    SSE_L2SQR(e_l+8, e_r+8, sum, l2, r2);
    case 8:
    SSE_L2SQR(e_l+4, e_r+4, sum, l1, r1);
    case 4:
    SSE_L2SQR(e_l, e_r, sum, l0, r0);
  default:
    break;
}
for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
    SSE_L2SQR(l, r, sum, l0, r0);
    SSE_L2SQR(l + 4, r + 4, sum, l1, r1);
    SSE_L2SQR(l + 8, r + 8, sum, l2, r2);
    SSE_L2SQR(l + 12, r + 12, sum, l3, r3);
}
_mm_storeu_ps(unpack, sum);
result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
*/
// normal distance
#else

    float diff0, diff1, diff2, diff3;
    const float *last = a + size;
    const float *unroll_group = last - 3;

    /* Process 4 items with each loop for efficiency. */
    while (a < unroll_group) {
      diff0 = a[0] - b[0];
      diff1 = a[1] - b[1];
      diff2 = a[2] - b[2];
      diff3 = a[3] - b[3];
      result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3;
      a += 4;
      b += 4;
    }
    /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
    while (a < last) {
      diff0 = *a++ - *b++;
      result += diff0 * diff0;
    }
// #endif
#endif
#endif

    return result;
  }
};

class DistanceInnerProduct : public Distance {
 public:
  float compare(const float *a, const float *b, unsigned size) const {
    float result = 0;
#ifdef __GNUC__
#ifdef __AVX__
#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm256_loadu_ps(addr1);                \
  tmp2 = _mm256_loadu_ps(addr2);                \
  tmp1 = _mm256_mul_ps(tmp1, tmp2);             \
  dest = _mm256_add_ps(dest, tmp1);

    __m256 sum;
    __m256 l0, l1;
    __m256 r0, r1;
    unsigned D = (size + 7) & ~7U;
    unsigned DR = D % 16;
    unsigned DD = D - DR;
    const float *l = a;
    const float *r = b;
    const float *e_l = l + DD;
    const float *e_r = r + DD;
    float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};

    sum = _mm256_loadu_ps(unpack);
    if (DR) {
      AVX_DOT(e_l, e_r, sum, l0, r0);
    }

    for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
      AVX_DOT(l, r, sum, l0, r0);
      AVX_DOT(l + 8, r + 8, sum, l1, r1);
    }
    _mm256_storeu_ps(unpack, sum);
    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
             unpack[5] + unpack[6] + unpack[7];
/*
#else
#ifdef __SSE2__
      #define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \
          tmp1 = _mm128_loadu_ps(addr1);\
          tmp2 = _mm128_loadu_ps(addr2);\
          tmp1 = _mm128_mul_ps(tmp1, tmp2); \
          dest = _mm128_add_ps(dest, tmp1);
      __m128 sum;
      __m128 l0, l1, l2, l3;
      __m128 r0, r1, r2, r3;
      unsigned D = (size + 3) & ~3U;
      unsigned DR = D % 16;
      unsigned DD = D - DR;
      const float *l = a;
      const float *r = b;
      const float *e_l = l + DD;
      const float *e_r = r + DD;
      float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0};

      sum = _mm_load_ps(unpack);
      switch (DR) {
          case 12:
          SSE_DOT(e_l+8, e_r+8, sum, l2, r2);
          case 8:
          SSE_DOT(e_l+4, e_r+4, sum, l1, r1);
          case 4:
          SSE_DOT(e_l, e_r, sum, l0, r0);
        default:
          break;
      }
      for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
          SSE_DOT(l, r, sum, l0, r0);
          SSE_DOT(l + 4, r + 4, sum, l1, r1);
          SSE_DOT(l + 8, r + 8, sum, l2, r2);
          SSE_DOT(l + 12, r + 12, sum, l3, r3);
      }
      _mm_storeu_ps(unpack, sum);
      result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
*/
#else

    float dot0, dot1, dot2, dot3;
    const float *last = a + size;
    const float *unroll_group = last - 3;

    /* Process 4 items with each loop for efficiency. */
    while (a < unroll_group) {
      dot0 = a[0] * b[0];
      dot1 = a[1] * b[1];
      dot2 = a[2] * b[2];
      dot3 = a[3] * b[3];
      result += dot0 + dot1 + dot2 + dot3;
      a += 4;
      b += 4;
    }
    /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
    while (a < last) {
      result += *a++ * *b++;
    }
// #endif
#endif
#endif
    return result;
  }
};
class DistanceFastL2 : public DistanceInnerProduct {
 public:
  float norm(const float *a, unsigned size) const {
    float result = 0;
#ifdef __GNUC__
#ifdef __AVX__
#define AVX_L2NORM(addr, dest, tmp) \
  tmp = _mm256_loadu_ps(addr);      \
  tmp = _mm256_mul_ps(tmp, tmp);    \
  dest = _mm256_add_ps(dest, tmp);

    __m256 sum;
    __m256 l0, l1;
    unsigned D = (size + 7) & ~7U;
    unsigned DR = D % 16;
    unsigned DD = D - DR;
    const float *l = a;
    const float *e_l = l + DD;
    float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};

    sum = _mm256_loadu_ps(unpack);
    if (DR) {
      AVX_L2NORM(e_l, sum, l0);
    }
    for (unsigned i = 0; i < DD; i += 16, l += 16) {
      AVX_L2NORM(l, sum, l0);
      AVX_L2NORM(l + 8, sum, l1);
    }
    _mm256_storeu_ps(unpack, sum);
    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
             unpack[5] + unpack[6] + unpack[7];
/*
#else
#ifdef __SSE2__
#define SSE_L2NORM(addr, dest, tmp) \
    tmp = _mm128_loadu_ps(addr); \
    tmp = _mm128_mul_ps(tmp, tmp); \
    dest = _mm128_add_ps(dest, tmp);

    __m128 sum;
    __m128 l0, l1, l2, l3;
    unsigned D = (size + 3) & ~3U;
    unsigned DR = D % 16;
    unsigned DD = D - DR;
    const float *l = a;
    const float *e_l = l + DD;
    float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0};

    sum = _mm_load_ps(unpack);
    switch (DR) {
        case 12:
        SSE_L2NORM(e_l+8, sum, l2);
        case 8:
        SSE_L2NORM(e_l+4, sum, l1);
        case 4:
        SSE_L2NORM(e_l, sum, l0);
      default:
        break;
    }
    for (unsigned i = 0; i < DD; i += 16, l += 16) {
        SSE_L2NORM(l, sum, l0);
        SSE_L2NORM(l + 4, sum, l1);
        SSE_L2NORM(l + 8, sum, l2);
        SSE_L2NORM(l + 12, sum, l3);
    }
    _mm_storeu_ps(unpack, sum);
    result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
*/
#else
    float dot0, dot1, dot2, dot3;
    const float *last = a + size;
    const float *unroll_group = last - 3;

    /* Process 4 items with each loop for efficiency. */
    while (a < unroll_group) {
      dot0 = a[0] * a[0];
      dot1 = a[1] * a[1];
      dot2 = a[2] * a[2];
      dot3 = a[3] * a[3];
      result += dot0 + dot1 + dot2 + dot3;
      a += 4;
    }
    /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
    while (a < last) {
      result += (*a) * (*a);
      a++;
    }
// #endif
#endif
#endif
    return result;
  }
  using DistanceInnerProduct::compare;
  float compare(const float *a, const float *b, float norm,
                unsigned size) const {  // not implement
    float result = -2 * DistanceInnerProduct::compare(a, b, size);
    result += norm;
    return result;
  }
};
}  // namespace efanna2e


#endif  // EFANNA2E_DISTANCE_H


================================================
FILE: algorithms/utils/beamSearch.h
================================================
#ifndef ALGORITHMS_ANN_BEAM_SEARCH_H_
#define ALGORITHMS_ANN_BEAM_SEARCH_H_

#include <algorithm>
#include <functional>
#include <limits>
#include <random>
#include <set>
#include <unordered_set>
#include <queue>

#include "parlay/io.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"
#include "types.h"
#include "graph.h"
#include "stats.h"
#include "hashset.h"

namespace parlayANN {

  // Default early-stopping policy for the beam searches: it ignores the
  // search state it is handed and never asks the search to stop.
  struct EarlyStopping {
    template<typename PointInfo>
    bool operator () (const PointInfo& /*frontier*/,
                      const PointInfo& /*unvisited_frontier*/,
                      const PointInfo& /*visited*/,
                      const QueryParams& /*QP*/) {
      return false;
    }
  };

  
  // Main beam search.
  //
  // Best-first search over graph G for the points closest to p.
  //   G                graph to traverse; G[v] is the neighbor list of v
  //   p, Points        query and point set whose distances order the beam
  //   qp, Q_Points     second representation of the query and point set; its
  //                    distances are used only when use_filtering is true, to
  //                    discard neighbors before the Points distance is computed
  //   starting_points  non-empty and no longer than QP.beamSize (else abort)
  //   QP               uses beamSize, degree_limit, limit and batch_factor
  //   use_filtering    enable the Q_Points pre-filter
  //   early_stop       called once per visited vertex with
  //                    (frontier, unvisited_frontier, visited, QP); the search
  //                    ends when it returns true
  //
  // Returns ((frontier, visited), n): the final beam sorted by (distance, id),
  // the visited vertices in the same order, and the number n of Points
  // distance evaluations.
template<typename indexType, typename Point, typename PointRange,
         typename QPoint, typename QPointRange, typename GT, typename ES = EarlyStopping>
std::pair<std::pair<parlay::sequence<std::pair<indexType, typename Point::distanceType>>,
                    parlay::sequence<std::pair<indexType, typename Point::distanceType>>>,
          size_t>
filtered_beam_search(const GT &G,
                     const Point p,  const PointRange &Points,
                     const QPoint qp, const QPointRange &Q_Points,
                     const parlay::sequence<indexType> starting_points,
                     const QueryParams &QP,
                     bool use_filtering = false,
                     ES early_stop = ES{}
                     ) {
  using dtype = typename Point::distanceType;
  using id_dist = std::pair<indexType, dtype>;
  int beamSize = QP.beamSize;
  int max_degree = QP.degree_limit;

  if (starting_points.size() == 0) {
    std::cout << "beam search expects at least one start point" << std::endl;
    abort();
  } else if (starting_points.size() > beamSize) {
    std::cout << "beam search has more starting points than beam size" << std::endl;
    abort();
  }

  // compare two (node_id,distance) pairs, first by distance and then id if
  // equal
  auto less = [&](id_dist a, id_dist b) {
    return a.second < b.second || (a.second == b.second && a.first < b.first);
  };

  // has_been_seen(v) is used below as a test-and-insert on a set of ids.
  // NOTE(review): per the comments in the loop it is an approximate hash
  // set -- see hashset.h for the exact guarantees.
  hashset<indexType> has_been_seen(2 * (10 + beamSize) * max_degree);

  // Frontier maintains the closest points found so far and its size
  // is always at most beamSize.  Each entry is a (id,distance) pair.
  // Initialized with starting points and kept sorted by distance.
  std::vector<id_dist> frontier;
  frontier.reserve(beamSize);
  for (auto q : starting_points) {
    frontier.push_back(id_dist(q, Points[q].distance(p)));
    has_been_seen(q);
  }
  std::sort(frontier.begin(), frontier.end(), less);

  // The subset of the frontier that has not been visited
  // Use the first of these to pick next vertex to visit.
  std::vector<id_dist> unvisited_frontier(beamSize);
  for (int i=0; i < frontier.size(); i++)
    unvisited_frontier[i] = frontier[i];

  // maintains sorted set of visited vertices (id-distance pairs)
  std::vector<id_dist> visited;
  visited.reserve(2 * beamSize);

  // counters
  size_t dist_cmps = starting_points.size();
  size_t full_dist_cmps = starting_points.size();
  int remain = frontier.size();  // number of valid entries in unvisited_frontier
  int num_visited = 0;

  // used as temporaries in the loop
  std::vector<id_dist> new_frontier(2 * std::max<size_t>(beamSize,starting_points.size()) +
                                    G.max_degree());
  std::vector<id_dist> candidates;
  candidates.reserve(G.max_degree() + beamSize);
  std::vector<indexType> filtered;
  filtered.reserve(G.max_degree());
  std::vector<indexType> pruned;
  pruned.reserve(G.max_degree());

  // State of the low-quality-distance filter: the threshold is the running
  // average, over visited vertices, of the mean Q_Points distance of the
  // frontier.
  dtype filter_threshold_sum = 0.0;
  int filter_threshold_count = 0;
  dtype filter_threshold = 0;
  indexType filter_id = 0;  // only read once filter_threshold_count > 0
  // A mean of distances, so it must have the distance type: accumulating it
  // in an id-typed variable would truncate every partial sum to an integer.
  dtype filter_tail_mean = 0;

  // offset into the unvisited_frontier vector (unvisited_frontier[offset] is the next to visit)
  int offset = 0;

  // The main loop.  Terminate beam search when the entire frontier
  // has been visited or have reached max_visit.
  while (remain > offset && num_visited < QP.limit) {

    // the next node to visit is the unvisited frontier node that is closest to p
    id_dist current = unvisited_frontier[offset];
    if (early_stop(frontier, unvisited_frontier, visited, QP))
      break;

    G[current.first].prefetch();
    // add to visited set
    auto position = std::upper_bound(visited.begin(), visited.end(), current, less);
    visited.insert(position, current);
    num_visited++;
    bool frontier_full = frontier.size() == beamSize;

    // if using filtering based on lower quality distances measure, then maintain the average
    // of low quality distance to the last point in the frontier (if frontier is full)
    if (use_filtering && frontier_full) {
      //constexpr int width = 5;
      int width = frontier.size();
      indexType id = frontier.back().first;
      // recompute the mean only when the last frontier element has changed
      if (filter_threshold_count == 0 || filter_id != id) {
        filter_tail_mean = 0.0;
        for (int i = frontier.size() - width; i < frontier.size(); i ++)
          filter_tail_mean += Q_Points[frontier[i].first].distance(qp);
        filter_tail_mean /= width;
        filter_id = id;
      }
      filter_threshold_sum += filter_tail_mean;
      filter_threshold_count++;
      filter_threshold = filter_threshold_sum / filter_threshold_count;
    }

    // keep neighbors that have not been visited (using approximate
    // hash). Note that if a visited node is accidentally kept due to
    // approximate hash it will be removed below by the union.
    pruned.clear();
    filtered.clear();
    long num_elts = std::min<long>(G[current.first].size(), QP.degree_limit);
    for (indexType i=0; i<num_elts; i++) {
      auto a = G[current.first][i];
      if (has_been_seen(a) || Points[a].same_as(p)) continue;  // skip if already seen
      Q_Points[a].prefetch();
      pruned.push_back(a);
    }
    dist_cmps += pruned.size();

    // filter using low-quality distance
    if (use_filtering && frontier_full) {
      for (auto a : pruned) {
        if (Q_Points[a].distance(qp) >= filter_threshold) continue;
        filtered.push_back(a);
        Points[a].prefetch();
      }
    } else std::swap(filtered, pruned);

    // Further remove if distance is greater than current
    // furthest distance in current frontier (if full).
    dtype cutoff = (frontier_full
                    ? frontier[frontier.size() - 1].second
                    : std::numeric_limits<dtype>::max());
    for (auto a : filtered) {
      dtype dist = Points[a].distance(p);
      full_dist_cmps++;
      // skip if no closer than the cutoff
      if (dist >= cutoff) continue;
      candidates.push_back(std::pair{a, dist});
    }
    // If candidates insufficiently full then skip rest of step until sufficiently full.
    // This improves performance for higher accuracies (e.g. beam sizes of 100+)
    // Candidates are kept across skipped steps and merged in one batch later.
    if (candidates.size() == 0 ||
        (QP.limit >= 2 * beamSize &&
         //candidates.size() < beamSize/8 &&
         candidates.size() < QP.batch_factor * beamSize &&
         offset + 1 < remain)) {
      offset++;
      continue;
    }
    offset = 0;

    // sort the candidates by distance from p,
    // and remove any duplicates (to be robust for neighbor lists with duplicates)
    std::sort(candidates.begin(), candidates.end(), less);
    auto candidates_end = std::unique(candidates.begin(), candidates.end(),
                                      [] (auto a, auto b) {return a.first == b.first;});

    // union the frontier and candidates into new_frontier, both are sorted
    auto new_frontier_size =
      std::set_union(frontier.begin(), frontier.end(), candidates.begin(),
                     candidates_end, new_frontier.begin(), less) -
      new_frontier.begin();
    candidates.clear();

    // trim to at most beam size
    new_frontier_size = std::min<size_t>(beamSize, new_frontier_size);

    // copy new_frontier back to the frontier
    frontier.clear();
    for (indexType i = 0; i < new_frontier_size; i++)
      frontier.push_back(new_frontier[i]);

    // get the unvisited frontier
    remain = (std::set_difference(frontier.begin(),
                                  frontier.begin() + std::min<long>(frontier.size(), QP.beamSize),
                                  visited.begin(),
                                  visited.end(),
                                  unvisited_frontier.begin(), less) -
              unvisited_frontier.begin());
  }

  return std::make_pair(std::make_pair(parlay::to_sequence(frontier),
                                       parlay::to_sequence(visited)),
                        full_dist_cmps);
}

  // alternative experimental version
  // about equal performance
  //
  // Same parameters as filtered_beam_search.  Differences visible in this
  // function:
  //  - the frontier holds only vertices not yet visited; `offset` walks it
  //  - two max-heaps of distances (topQ, visitedQ), each capped at beamSize
  //    entries, replace the sorted-set operations
  //  - early_stop is accepted but never called (the call is commented out)
  //  - both sequences of the returned pair are the same: the visited
  //    vertices sorted by (distance, id) and cut to at most beamSize entries
  // The second component of the result is the number of Points distance
  // evaluations.
template<typename indexType, typename Point, typename PointRange,
         typename QPoint, typename QPointRange, typename GT, typename ES = EarlyStopping>
std::pair<std::pair<parlay::sequence<std::pair<indexType, typename Point::distanceType>>,
                    parlay::sequence<std::pair<indexType, typename Point::distanceType>>>,
          size_t>
filtered_beam_search_new(const GT &G,
                      const Point p,  const PointRange &Points,
                      const QPoint qp, const QPointRange &Q_Points,
                      const parlay::sequence<indexType> starting_points,
                      const QueryParams &QP,
                      bool use_filtering = false,
                      ES early_stop = ES{}
                      ) {
  using dtype = typename Point::distanceType;
  using id_dist = std::pair<indexType, dtype>;
  int beamSize = QP.beamSize;
  int max_degree = QP.degree_limit;

  if (starting_points.size() == 0) {
    std::cout << "beam search expects at least one start point" << std::endl;
    abort();
  } else if (starting_points.size() > beamSize) {
    std::cout << "beam search has more starting points than beam size" << std::endl;
    abort();
  }

  // compare two (node_id,distance) pairs, first by distance and then id if
  // equal
  using distanceType = typename Point::distanceType;
  auto less = [&](id_dist a, id_dist b) {
    return a.second < b.second || (a.second == b.second && a.first < b.first);
  };

  // has_been_seen(v) is used below as a test-and-insert on a set of ids
  long set_size = 1.5 * (10 + beamSize) * max_degree;
  hashset<indexType> has_been_seen(set_size);
  
  // Frontier holds the (id,distance) pairs still to be visited, sorted by
  // distance.  Initialized with the starting points.
  std::vector<id_dist> frontier;
  frontier.reserve(2*beamSize);
  for (auto q : starting_points) {
    frontier.push_back(id_dist(q, Points[q].distance(p)));
    has_been_seen(q);
  }
  std::sort(frontier.begin(), frontier.end(), less);
  std::vector<id_dist> new_frontier;
  
  // visited vertices (id-distance pairs) in visiting order; sorted only
  // once, after the loop
  std::vector<id_dist> visited;
  visited.reserve(2 * beamSize);

  // counters
  size_t dist_cmps = starting_points.size();
  size_t full_dist_cmps = starting_points.size();
  int num_visited = 0;

  // used as temporaries in the loop
  std::vector<id_dist> candidates;
  candidates.reserve(G.max_degree() + beamSize);
  std::vector<indexType> filtered;
  filtered.reserve(G.max_degree());
  std::vector<indexType> pruned;
  pruned.reserve(G.max_degree());

  // offset into the frontier vector (frontier[offset] is the next to visit)
  int offset = 0;
  // topQ: max-heap of the smallest distances accepted so far (start points
  // and candidates), capped at beamSize; its top is the current cutoff
  std::priority_queue<dtype> topQ;
  for (auto [v,d] : frontier)
    topQ.push(d);
  // visitedQ: max-heap of the smallest visited distances, capped at beamSize
  std::priority_queue<dtype> visitedQ;

  // state of the low-quality-distance filter
  float filter_threshold = 0.0;
  int filter_threshold_cnt = 0;
  float round_sum = 0.0;  // sum of Q_Points distances of this round's candidates

  // The main loop.  Terminate beam search when the entire frontier
  // has been visited or have reached max_visit.
  while (frontier.size() > 0 && num_visited < QP.limit) {
    // the next node to visit is the unvisited frontier node that is closest to p
    id_dist current = frontier[offset];
    // stop once beamSize visited vertices are all at least as close as current
    if (visitedQ.size() == beamSize && visitedQ.top() <= current.second) break;
    visited.push_back(current);
    visitedQ.push(current.second);
    if (visitedQ.size() > beamSize)
      visitedQ.pop();

    //if (early_stop(frontier, unvisited_frontier, visited, QP))
    //  break;
    
    G[current.first].prefetch();
    num_visited++;
    bool has_full_beam = (topQ.size() >= beamSize);

    // keep the neighbors of current that have not been seen before
    pruned.clear();
    filtered.clear();
    long num_elts = std::min<long>(G[current.first].size(), QP.degree_limit);
    for (indexType i=0; i<num_elts; i++) {
      auto a = G[current.first][i];
      if (has_been_seen(a) || Points[a].same_as(p)) continue;  // skip if already seen
      Q_Points[a].prefetch();
      pruned.push_back(a);
    }
    dist_cmps += pruned.size();
    
    // filter using low-quality distance
    if (use_filtering && has_full_beam) {
      for (auto a : pruned) {
        if (Q_Points[a].distance(qp) >= filter_threshold) continue;
        filtered.push_back(a);
        Points[a].prefetch();
      }
    } else std::swap(filtered, pruned);
    
    // Further remove if distance is no smaller than the largest of the
    // beamSize smallest distances accepted so far (once there are that many).
    for (auto a : filtered) {
      distanceType dist = Points[a].distance(p);
      full_dist_cmps++;
      // skip if the beam is full and the distance does not improve on it
      if (topQ.size() == beamSize && topQ.top() <= dist)
        continue;
      topQ.push(dist);
      if (topQ.size() > beamSize) topQ.pop();
      if (use_filtering)
        round_sum += Q_Points[a].distance(qp);
      candidates.push_back(std::pair{a, dist});
    }

    offset++;

    // If candidates insufficiently full then skip rest of step until sufficiently full.
    // This improves performance for higher accuracies (e.g. beam sizes of 100+)
    // The skip happens only while frontier entries remain and the next one
    // would not trigger the stopping test at the top of the loop.
    if (offset != frontier.size() &&
        (candidates.size() == 0 || 
         (QP.limit >= 2 * beamSize &&
          candidates.size() < QP.batch_factor * beamSize)) &&
        (visitedQ.size() != beamSize ||
         visitedQ.top() > frontier[offset].second))
      continue;

    if (use_filtering) {
      // NOTE(review): candidates can be empty here, which makes this a
      // division by zero -- confirm the resulting threshold is never used.
      float round_average = round_sum/candidates.size();
      // We use a rolling average to keep the filter_threshold smooth
      // and always a bit bigger than distances we have seen recently
      // so we don't filter out too many points.
      if (filter_threshold_cnt == 0)
        filter_threshold = round_average;
      else filter_threshold = (filter_threshold * .85 + round_average * .15);
      round_sum = 0;
      filter_threshold_cnt++;
    }
    
    // sort the candidates by distance from p,
    std::sort(candidates.begin(), candidates.end(), less);
    
    // merge the unvisited rest of the frontier and the candidates into
    // new_frontier, both are sorted
    long merge_size = frontier.size() - offset + candidates.size();
        
    new_frontier.resize(merge_size);
    std::merge(frontier.begin()+offset, frontier.end(), candidates.begin(),
               candidates.end(), new_frontier.begin(), less);
    // trim to at most beam size
    if (merge_size > beamSize) 
      new_frontier.resize(beamSize);
    candidates.clear();
    std::swap(frontier, new_frontier);
    offset = 0;
  }

  // sort all visited points and take the first beamSize of them
  std::sort(visited.begin(), visited.end(), less);
  if (visited.size() > beamSize)
    visited.resize(beamSize);

  return std::make_pair(std::make_pair(parlay::to_sequence(visited),
                                       parlay::to_sequence(visited)),
                        full_dist_cmps);
}

  // Stopping policy with the same call signature as EarlyStopping; it also
  // ignores its arguments and always answers "do not stop".
  struct EStop {
    template<typename PointInfo>
    bool operator () (const PointInfo& /*frontier*/,
                      const PointInfo& /*unvisited_frontier*/,
                      const PointInfo& /*visited*/,
                      const QueryParams& /*QP*/) {
      return false;
    }
  };
  
// Beam search without the low-quality-distance filter: forwards to
// filtered_beam_search, passing p and Points for both the main and the
// filtering representation and leaving filtering switched off.
template<typename Point, typename PointRange, typename indexType>
std::pair<std::pair<parlay::sequence<std::pair<indexType, typename Point::distanceType>>,
                    parlay::sequence<std::pair<indexType, typename Point::distanceType>>>, size_t>
beam_search(const Point p, const Graph<indexType> &G, const PointRange &Points,
            const parlay::sequence<indexType> starting_points, const QueryParams &QP
            ) {
  const bool use_filtering = false;
  return filtered_beam_search(G,
                              p, Points,
                              p, Points,
                              starting_points, QP, use_filtering);
}

// Kept for backward compatibility (used by hnsw): older argument order and a
// generic graph type GT.  Forwards to filtered_beam_search with filtering off.
template<typename indexType, typename Point, typename PointRange, class GT>
std::pair<std::pair<parlay::sequence<std::pair<indexType, typename Point::distanceType>>, parlay::sequence<std::pair<indexType, typename Point::distanceType>>>, size_t>
beam_search_impl(Point p, GT &G, PointRange &Points,
                 parlay::sequence<indexType> starting_points, QueryParams &QP) {
  const bool use_filtering = false;
  return filtered_beam_search(G,
                              p, Points,
                              p, Points,
                              starting_points, QP, use_filtering);
}

// Convenience overload taking a single start point: wraps it in a
// one-element sequence and forwards to the sequence overload of beam_search.
// The distance-comparison count is returned as size_t, the type the
// underlying search produces, so it is not narrowed to the vertex-id type.
template<typename Point, typename PointRange, typename indexType>
std::pair<std::pair<parlay::sequence<std::pair<indexType, typename Point::distanceType>>,
                    parlay::sequence<std::pair<indexType, typename Point::distanceType>>>, size_t>
beam_search(const Point p, const Graph<indexType> &G, const PointRange &Points,
            const indexType starting_point, const QueryParams &QP) {
  parlay::sequence<indexType> start_points = {starting_point};
  return beam_search(p, G, Points, start_points, QP);
}

// Searches every element of Query_Points, each from its own randomly
// selected starting vertex of G, and returns for each query the ids of the
// first QP.k elements of the resulting beam.  Per-query visited and
// distance-comparison counts are added to QueryStats.
// Aborts if QP.k > QP.beamSize, or if a search returns fewer than QP.k
// elements (indexing the beam would otherwise read out of bounds).
template<typename PointRange, typename indexType>
parlay::sequence<parlay::sequence<indexType>>
beamSearchRandom(const PointRange& Query_Points,
                 const Graph<indexType> &G,
                 const PointRange &Base_Points,
                 stats<indexType> &QueryStats,
                 const QueryParams &QP) {
  if (QP.k > QP.beamSize) {
    std::cout << "Error: beam search parameter Q = " << QP.beamSize
              << " same size or smaller than k = " << QP.k << std::endl;
    abort();
  }
  // one random start vertex per query
  size_t n = G.size();

  parlay::sequence<parlay::sequence<indexType>> all_neighbors(Query_Points.size());

  parlay::random_generator gen;
  std::uniform_int_distribution<long> dis(0, n - 1);
  auto indices = parlay::tabulate(Query_Points.size(), [&](size_t i) {
    auto r = gen[i];
    return dis(r);
  });

  parlay::parallel_for(0, Query_Points.size(), [&](size_t i) {
    indexType start = indices[i];
    auto result = beam_search(Query_Points[i], G, Base_Points, start, QP);
    // refer to the two sequences in place instead of copying them
    const auto &beamElts = result.first.first;
    const auto &visitedElts = result.first.second;
    if (beamElts.size() < QP.k) {
      std::cout << "Error: for point id " << i
                << " beam search returned " << beamElts.size()
                << " elements, which is less than k = " << QP.k << std::endl;
      abort();
    }
    parlay::sequence<indexType> neighbors = parlay::sequence<indexType>(QP.k);
    for (indexType j = 0; j < QP.k; j++) {
      neighbors[j] = beamElts[j].first;
    }
    all_neighbors[i] = std::move(neighbors);
    QueryStats.increment_visited(i, visitedElts.size());
    QueryStats.increment_dist(i, result.second);
  });
  return all_neighbors;
}

// Single-start convenience form of searchAll: wraps the start vertex in a
// one-element sequence and hands everything to the sequence overload.
template<typename PointRange, typename indexType>
parlay::sequence<parlay::sequence<indexType>>
searchAll(PointRange& Query_Points,
          Graph<indexType> &G, PointRange &Base_Points, stats<indexType> &QueryStats,
          indexType starting_point, QueryParams &QP) {
  parlay::sequence<indexType> single_start = {starting_point};
  return searchAll<PointRange, indexType>(Query_Points,
                                          G,
                                          Base_Points,
                                          QueryStats,
                                          single_start,
                                          QP);
}

// Runs a beam search from starting_points for every element of Query_Points
// (in parallel) and returns for each query the ids of the first QP.k
// elements of the resulting beam.  Per-query visited and distance-comparison
// counts are added to QueryStats.
// Aborts if QP.k > QP.beamSize, or if a search returns fewer than QP.k
// elements (indexing the beam would otherwise read out of bounds).
template< typename PointRange, typename indexType>
parlay::sequence<parlay::sequence<indexType>>
searchAll(PointRange &Query_Points,
          Graph<indexType> &G, PointRange &Base_Points, stats<indexType> &QueryStats,
          parlay::sequence<indexType> starting_points,
          QueryParams &QP) {
  if (QP.k > QP.beamSize) {
    std::cout << "Error: beam search parameter Q = " << QP.beamSize
              << " same size or smaller than k = " << QP.k << std::endl;
    abort();
  }
  parlay::sequence<parlay::sequence<indexType>> all_neighbors(Query_Points.size());
  parlay::parallel_for(0, Query_Points.size(), [&](size_t i) {
    auto result = beam_search(Query_Points[i], G, Base_Points, starting_points, QP);
    // refer to the two sequences in place instead of copying them
    const auto &beamElts = result.first.first;
    const auto &visitedElts = result.first.second;
    if (beamElts.size() < QP.k) {
      std::cout << "Error: for point id " << i
                << " beam search returned " << beamElts.size()
                << " elements, which is less than k = " << QP.k << std::endl;
      abort();
    }
    parlay::sequence<indexType> neighbors = parlay::sequence<indexType>(QP.k);
    for (indexType j = 0; j < QP.k; j++) {
      neighbors[j] = beamElts[j].first;
    }
    all_neighbors[i] = std::move(neighbors);
    QueryStats.increment_visited(i, visitedElts.size());
    QueryStats.increment_dist(i, result.second);
  });

  return all_neighbors;
}

// Returns a sequence of the QP.k nearest neighbors found for one query, each
// with its distance measured between p and the entry of Base_Points.
//
// The query is given in three representations (p, qp, qqp) with matching
// point sets (Base_Points, Q_Base_Points, QQ_Base_Points).  The graph search
// itself runs on (qp, Q_Base_Points); (qqp, QQ_Base_Points) is used as its
// pre-filter when the two sets report different num_bytes().  When
// Base_Points and Q_Base_Points report different num_bytes(), the first
// k * rerank_factor beam entries (at least k, at most the beam) are re-ranked
// by their Base_Points distance; otherwise the first k beam entries are
// returned in beam order.
//
// If `stats` is true the visited and distance-comparison counts are added to
// QueryStats under p.id().  Aborts if the beam holds fewer than QP.k entries.
template<typename Point, typename QPoint, typename QQPoint,
         typename PointRange, typename QPointRange, typename QQPointRange,
         typename indexType>
parlay::sequence<std::pair<indexType, typename Point::distanceType>>
beam_search_rerank(const Point &p,
                   const QPoint &qp,
                   const QQPoint &qqp,
                   const Graph<indexType> &G,
                   const PointRange &Base_Points,
                   const QPointRange &Q_Base_Points,
                   const QQPointRange &QQ_Base_Points,
                   stats<indexType> &QueryStats,
                   const parlay::sequence<indexType> starting_points,
                   const QueryParams &QP,
                   bool stats = true) {
  using dtype = typename Point::distanceType;
  using id_dist = std::pair<indexType, dtype>;

  bool use_rerank = (Base_Points.params.num_bytes() != Q_Base_Points.params.num_bytes());
  bool use_filtering = (Q_Base_Points.params.num_bytes() != QQ_Base_Points.params.num_bytes());
  std::pair<std::pair<parlay::sequence<id_dist>, parlay::sequence<id_dist>>, size_t> r =
    filtered_beam_search(G,
                         qp, Q_Base_Points,
                         qqp, QQ_Base_Points,
                         starting_points, QP, use_filtering);
  // refer to the parts of the result in place instead of copying them
  const parlay::sequence<id_dist> &beamElts = r.first.first;
  const parlay::sequence<id_dist> &visitedElts = r.first.second;
  const size_t dist_cmps = r.second;
  if (beamElts.size() < QP.k) {
    std::cout << "Error: for point id " << p.id()
              << " beam search returned " << beamElts.size()
              << " elements, which is less than k = " << QP.k << std::endl;
    abort();
  }

  if (stats) {
    QueryStats.increment_visited(p.id(), visitedElts.size());
    QueryStats.increment_dist(p.id(), dist_cmps);
  }

  if (use_rerank) {
    // recalculate distances with non-quantized points and sort
    int num_check = std::min<int>(QP.k * QP.rerank_factor, beamElts.size());
    // never look at fewer than k entries (the beam holds at least k, checked
    // above); otherwise the copy of the first k below would run off `pts`
    num_check = std::max<int>(num_check, QP.k);
    std::vector<id_dist> pts;
    pts.reserve(num_check);
    for (int i=0; i < num_check; i++) {
      // keep the id in its own type; an int could not hold every indexType
      indexType j = beamElts[i].first;
      pts.push_back(id_dist(j, p.distance(Base_Points[j])));
    }
    auto less = [&] (id_dist a, id_dist b) {
      return a.second < b.second || (a.second == b.second && a.first < b.first);
    };
    std::sort(pts.begin(), pts.end(), less);

    // keep first k
    parlay::sequence<id_dist> results;
    for (int i= 0; i < QP.k; i++)
      results.push_back(pts[i]);

    return results;
  } else {
    // same ids and order as the beam, with the distance taken from Base_Points
    parlay::sequence<id_dist> results;
    for (int i= 0; i < QP.k; i++) {
      indexType j = beamElts[i].first;
      results.push_back(id_dist(j, p.distance(Base_Points[j])));
    }
    return results;
  }
}

// Answers every query in Query_Points with beam_search_rerank and returns,
// for each query, the ids of the neighbors it reported (distances dropped).
//
// Query_Points / Q_Query_Points / QQ_Query_Points are three representations
// of the same queries and are indexed in lockstep; the same holds for the
// three base point ranges. Per-query counters are accumulated in QueryStats.
//
// When `random` is false every search starts from `starting_point`. When it
// is true each query gets its own start vertex drawn from [0, G.size() - 1].
//
// Aborts the process if QP.k exceeds QP.beamSize.
template<typename PointRange, typename QPointRange, typename QQPointRange, typename indexType>
parlay::sequence<parlay::sequence<indexType>>
qsearchAll(const PointRange &Query_Points,
           const QPointRange &Q_Query_Points,
           const QQPointRange &QQ_Query_Points,
           const Graph<indexType> &G,
           const PointRange &Base_Points,
           const QPointRange &Q_Base_Points,
           const QQPointRange &QQ_Base_Points,
           stats<indexType> &QueryStats,
           const indexType starting_point,
           const QueryParams &QP,
           bool random = false) {
  if (QP.k > QP.beamSize) {
    std::cout << "Error: beam search parameter Q = " << QP.beamSize
              << " same size or smaller than k = " << QP.k << std::endl;
    abort();
  }

  const size_t num_queries = Query_Points.size();
  parlay::sequence<parlay::sequence<indexType>> all_neighbors(num_queries);

  // Runs one query from the given start set and stores only the ids.
  auto search_from = [&] (size_t q, parlay::sequence<indexType> &starts) {
    auto found = beam_search_rerank(Query_Points[q], Q_Query_Points[q], QQ_Query_Points[q],
                                    G,
                                    Base_Points, Q_Base_Points, QQ_Base_Points,
                                    QueryStats, starts, QP);
    all_neighbors[q] = parlay::map(found, [] (auto& entry) {return entry.first;});
  };

  if (!random) {
    // One start set shared by all queries.
    parlay::sequence<indexType> fixed_start = {starting_point};
    parlay::parallel_for(0, num_queries, [&](size_t q) {
      search_from(q, fixed_start);
    });
    return all_neighbors;
  }

  // Pre-draw one start vertex per query, then search from it.
  parlay::random_generator gen;
  std::uniform_int_distribution<long> dis(0, G.size() - 1);
  auto random_starts = parlay::tabulate(num_queries, [&](size_t q) -> indexType {
    auto r = gen[q];
    return dis(r);
  });

  parlay::parallel_for(0, num_queries, [&](size_t q) {
    parlay::sequence<indexType> own_start = {random_starts[q]};
    search_from(q, own_start);
  });

  return all_neighbors;
}


} // end namespace

#endif // ALGORITHMS_ANN_BEAM_SEARCH_H_


================================================
FILE: algorithms/utils/check_nn_recall.h
================================================
#ifndef ALGORITHMS_CHECK_NN_RECALL_H_
#define ALGORITHMS_CHECK_NN_RECALL_H_

#include <algorithm>
#include <set>

#include "beamSearch.h"
#include "csvfile.h"
#include "parse_results.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "types.h"
#include "stats.h"

namespace parlayANN {

// Runs every query through qsearchAll, times the whole batch, and, when
// ground truth is supplied (GT.size() > 0), computes k@k recall.
//
// Recall counts a reported neighbor as correct if it is one of the first k
// ground-truth neighbors, or a later ground-truth neighbor whose exact
// distance to the query equals that of the k-th one (ties).
//
// Parameters:
//   G                         graph to search
//   Base_Points/Query_Points  points used for exact distances
//   Q_*, QQ_*                 further representations of the same points,
//                             forwarded to qsearchAll
//   GT                        ground truth; may be empty, in which case the
//                             reported recall is 0
//   random                    forwarded to qsearchAll (random start vertices)
//   start_point               start vertex used when `random` is false
//   k                         number of neighbors scored per query
//   QP                        search parameters
//   verbose                   print a one-line summary to stdout
//
// Returns an nn_result built from the recall, the query statistics and the
// measured queries-per-second.
//
// Aborts if k exceeds GT.dimension().
template<typename PointRange, typename QPointRange, typename QQPointRange, typename indexType>
nn_result checkRecall(const Graph<indexType> &G,
                      const PointRange &Base_Points,
                      const PointRange &Query_Points,
                      const QPointRange &Q_Base_Points,
                      const QPointRange &Q_Query_Points,
                      const QQPointRange &QQ_Base_Points,
                      const QQPointRange &QQ_Query_Points,
                      const groundTruth<indexType> &GT,
                      const bool random,
                      const long start_point,
                      const long k,
                      const QueryParams &QP,
                      const bool verbose) {
  using Point = typename PointRange::Point;

  if (GT.size() > 0 && k > GT.dimension()) {
    std::cout << k << "@" << k << " too large for ground truth data of size "
              << GT.dimension() << std::endl;
    abort();
  }

  parlay::sequence<parlay::sequence<indexType>> all_ngh;

  parlay::internal::timer t;
  float query_time;
  stats<indexType> QueryStats(Query_Points.size());
  QueryStats.clear();
  // to help clear the cache between runs
  auto volatile xx = parlay::random_permutation<long>(5000000);
  t.next_time();
  // The flag is forwarded directly; qsearchAll defaults it to false, so this
  // covers both the random and the fixed-start case.
  all_ngh = qsearchAll<PointRange, QPointRange, QQPointRange, indexType>(Query_Points, Q_Query_Points, QQ_Query_Points,
                                                                         G,
                                                                         Base_Points, Q_Base_Points, QQ_Base_Points,
                                                                         QueryStats, start_point, QP, random);
  query_time = t.next_time();

  float recall = 0.0;
  //TODO deprecate this after further testing
  bool dists_present = true;
  if (GT.size() > 0 && !dists_present) {
    // Plain set-intersection recall without tie handling (currently disabled).
    size_t n = Query_Points.size();
    int numCorrect = 0;
    for (indexType i = 0; i < n; i++) {
      std::set<indexType> reported_nbhs;
      if (all_ngh[i].size() != k) {
        std::cout << "bad number of neighbors reported: " << all_ngh[i].size() << std::endl;
        abort();
      }
      for (indexType l = 0; l < k; l++) reported_nbhs.insert((all_ngh[i])[l]);
      if (reported_nbhs.size() != k) {
        std::cout << "duplicate entries in reported neighbors" << std::endl;
        abort();
      }
      for (indexType l = 0; l < k; l++) {
        if (reported_nbhs.find((GT.coordinates(i,l))) !=
            reported_nbhs.end()) {
          numCorrect += 1;
        }
      }
    }
    recall = static_cast<float>(numCorrect) / static_cast<float>(k * n);
  } else if (GT.size() > 0 && dists_present) {
    size_t n = Query_Points.size();

    int numCorrect = 0;
    for (indexType i = 0; i < n; i++) {
      // First k ground-truth ids plus any later ones tied with the k-th.
      parlay::sequence<int> results_with_ties;
      for (indexType l = 0; l < k; l++)
        results_with_ties.push_back(GT.coordinates(i,l));
      Point qp = Query_Points[i];
      float last_dist = qp.distance(Base_Points[GT.coordinates(i, k-1)]);
      //float last_dist = GT.distances(i, k-1);
      for (indexType l = k; l < GT.dimension(); l++) {
        //if (GT.distances(i,l) == last_dist) {
        if (qp.distance(Base_Points[GT.coordinates(i, l)]) == last_dist) {
          results_with_ties.push_back(GT.coordinates(i,l));
        }
      }
      std::set<int> reported_nbhs;
      // The search returns QP.k ids, which can be fewer than k; never read
      // past what was actually reported.
      const size_t num_reported = std::min<size_t>(k, all_ngh[i].size());
      for (size_t l = 0; l < num_reported; l++) reported_nbhs.insert((all_ngh[i])[l]);
      for (indexType l = 0; l < results_with_ties.size(); l++) {
        if (reported_nbhs.find(results_with_ties[l]) != reported_nbhs.end()) {
          numCorrect += 1;
        }
      }
    }
    recall = static_cast<float>(numCorrect) / static_cast<float>(k * n);
  }
  float QPS = Query_Points.size() / query_time;
  if (verbose)
    std::cout << "search: Q=" << QP.beamSize << ", k=" << QP.k
              << ", limit=" << QP.limit
      //<< ", dlimit=" << QP.degree_limit
              << ", recall=" << recall
              << ", visited=" << QueryStats.visited_stats()[0]
              << ", comparisons=" << QueryStats.dist_stats()[0]
              << ", QPS=" << QPS
              << ", ctime=" << 1/(QPS*QueryStats.dist_stats()[0]) * 1e9 << std::endl;

  auto stats_ = {QueryStats.dist_stats(), QueryStats.visited_stats()};
  parlay::sequence<indexType> stats = parlay::flatten(stats_);
  nn_result N(recall, stats, QPS, k, QP.beamSize, Query_Points.size(), QP.limit, QP.degree_limit, k);
  return N;
}

void write_to_csv(std::string csv_filename, parlay::sequence<float> buckets,
                  parlay::sequence<nn_result> results, Graph_ G) {
  csvfile csv(csv_filename);
  csv << "GRAPH"
      << "Parameters"
      << "Size"
      << "Build time"
      << "Avg degree"
      << "Max degree" << endrow;
  csv << G.name << G.params << G.size << G.time << G.avg_deg << G.max_deg
      << endrow;
  csv << endrow;
  csv << "Num queries"
      << "Target recall"
      << "Actual recall"
      << "QPS"
      << "Average Cmps"
      << "Tail Cmps"
      << "Average Visited"
      << "Tail Visited"
      << "k"
      << "Q"
      << endrow;
  for (int i = 0; i < results.size(); i++) {
    nn_result N = results[i];
    csv << N.num_queries << buckets[i] << N.recall << N.QPS << N.avg_cmps
        << N.tail_cmps << N.avg_visited << N.tail_visited << N.k << N.beamQ
        << endrow;
  }
  csv << endrow;
  csv << endrow;
}

// Returns six limits equal to 40%, 50%, ..., 90% of `upper_bound`,
// each truncated to a long.
//
// Declared inline because this non-template function is defined in a header.
inline parlay::sequence<long> calculate_limits(size_t upper_bound) {
  constexpr int kNumLimits = 6;
  parlay::sequence<long> L(kNumLimits);
  // An integer counter is used for indexing; the arithmetic is still done
  // in float and then double (.1), exactly as before.
  for (int i = 0; i < kNumLimits; i++) {
    L[i] = (long)(static_cast<float>(4 + i) * ((float) upper_bound) * .1);
  }
  return L;
}

// Convenience overload for the case without separate quantized point sets:
// the base and query ranges are passed to the full overload in all three
// roles. Searches start from vertex 0 (random starts disabled).
//
// The start vertex is written as static_cast<indexType>(0) rather than the
// literal 0u so that template deduction of indexType in the full overload
// agrees with Graph<indexType> for every index type, not only unsigned int.
template<typename PointRange, typename indexType>
void search_and_parse(Graph_ G_,
                      Graph<indexType> &G,
                      PointRange &Base_Points,
                      PointRange &Query_Points,
                      groundTruth<indexType> GT, char* res_file, long k,
                      bool verbose = false,
                      long fixed_beam_width = 0) {
  search_and_parse(G_, G, Base_Points, Query_Points, Base_Points, Query_Points, Base_Points, Query_Points, GT, res_file, k, false, static_cast<indexType>(0), verbose, fixed_beam_width);
}

// Drives a recall experiment over graph G.
//
// With a non-zero `fixed_beam_width`, checkRecall is simply run five times
// with that beam size and the results are discarded.
//
// Otherwise, for the requested k (10 when k == 0), it
//   1. sweeps a fixed list of beam sizes (those >= k),
//   2. sweeps a fixed list of visit limits, with the beam set to
//      max(limit, k) and the degree limit to min(max_degree, 5 * limit),
//   3. runs one "best accuracy" configuration,
// then buckets all results with parse_result and, if `res_file` is non-null,
// appends them to that CSV file.
template<typename PointRange, typename QPointRange, typename QQPointRange, typename indexType>
void search_and_parse(Graph_ G_,
                      Graph<indexType> &G,
                      PointRange &Base_Points,
                      PointRange &Query_Points,
                      QPointRange &Q_Base_Points,
                      QPointRange &Q_Query_Points,
                      QQPointRange &QQ_Base_Points,
                      QQPointRange &QQ_Query_Points,
                      groundTruth<indexType> GT, char* res_file, long k,
                      bool random = true,
                      indexType start_point = 0,
                      bool verbose = false,
                      long fixed_beam_width = 0,
                      double rerank_factor = 100,
                      double batch_factor = .125) {
  parlay::sequence<nn_result> results;

  // Binds the data sets so each experiment only supplies k and parameters.
  auto check = [&] (const long k, const QueryParams QP) {
    return checkRecall(G,
                       Base_Points, Query_Points,
                       Q_Base_Points, Q_Query_Points,
                       QQ_Base_Points, QQ_Query_Points,
                       GT,
                       random,
                       start_point, k, QP, verbose);};

  QueryParams QP(k, 0, G.size(), G.max_degree(), rerank_factor, batch_factor);

  const std::vector<long> beams = {
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 24, 26, 28, 30, 32,
    34, 36, 38, 40, 45, 50, 55, 60, 65, 70, 80, 90, 100, 120, 140, 160,
    180, 200, 225, 250, 275, 300, 375, 500, 750, 1000};
  const std::vector<long> allr = {k == 0 ? 10 : k};

  // Fixed-beam mode: repeat the same measurement, nothing is recorded.
  if (fixed_beam_width != 0) {
    QP.k = allr[0];
    QP.beamSize = fixed_beam_width;
    for (int repeat = 0; repeat < 5; repeat++)
      check(QP.k, QP);
    return;
  }

  for (long r : allr) {
    results.clear();
    QP.k = r;

    // beam-size sweep
    for (float Q : beams){
      QP.beamSize = Q;
      if (Q < r) continue;
      results.push_back(check(r, QP));
    }

    // check "limited accuracy"
    parlay::sequence<long> limits = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
                                     22, 23, 24, 25, 26, 28, 30, 35};
    QP = QueryParams(r, r, (long) G.size(), (long) G.max_degree(),
                     rerank_factor, batch_factor);
    for (long l : limits){
      QP.limit = l;
      QP.beamSize = std::max<long>(l, r);
      QP.degree_limit = std::min<int>(G.max_degree(), 5 * l);
      results.push_back(check(r, QP));
    }

    // check "best accuracy"
    QP = QueryParams((long) 100, (long) 1000, (long) G.size(),
                     (long) G.max_degree(), rerank_factor, batch_factor);
    results.push_back(check(r, QP));

    parlay::sequence<float> buckets =  {.1, .2, .3,  .4,  .5,  .6, .7, .75,  .8, .85,
      .9, .93, .95, .97, .98, .99, .995, .999, .9995,
      .9999, .99995, .99999};
    auto [res, ret_buckets] = parse_result(results, buckets);
    std::cout << std::endl;
    if (res_file != nullptr)
      write_to_csv(std::string(res_file), ret_buckets, res, G_);
  }
}

// template<typename Point, typename PointRange, typename indexType>
// void search_and_parse(Graph_ G_,
//                       Graph<indexType> &G,
//                       PointRange &Base_Points,
//                       PointRange &Query_Points,
//                       groundTruth<indexType> GT, char* res_file, long k,
//                       bool random=true, indexType start_point=0,
//                       bool verbose=false) {
//   search_and_parse<Point>(G_, G, Base_Points, Query_Points, Base_Points, Query_Points, GT,
//                           res_file, k, random, start_point, verbose);
// }

} // end namespace

#endif // ALGORITHMS_CHECK_NN_RECALL_H_


================================================
FILE: algorithms/utils/check_range_recall.h
================================================
#include <algorithm>
#include <set>

#include "beamSearch.h"
#include "doublingSearch.h"
#include "rangeSearch.h"
#include "csvfile.h"
#include "parse_results.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "types.h"
#include "stats.h"

namespace parlayANN {

// Runs a range search for all queries with the strategy selected by
// QP.range_query_type and prints recall and cost figures to stdout.
//
//   Doubling      -> DoubleBeamRangeSearch (uses active_indices)
//   Greedy / Beam -> RangeSearch
//   anything else -> prints an error message and returns
//
// Recall is estimated from result counts only: for each query the number of
// reported results is compared with the number of ground-truth results
// GT[i].size(). "Point Recall" averages the per-query ratio over queries
// with a non-empty ground truth; "Cum Recall" is total reported / total
// ground truth.
template<typename Point, typename PointRange, typename QPointRange, typename indexType>
void checkRangeRecall(
        Graph<indexType> &G,
        PointRange &Base_Points, PointRange &Query_Points,
        QPointRange &Q_Base_Points, QPointRange &Q_Query_Points,
        RangeGroundTruth<indexType> GT, QueryParams QP,
        long start_point,parlay::sequence<indexType> &active_indices) {

  // Shared by both strategies: computes the recall figures from the result
  // counts and prints the summary line.
  auto report = [&] (const auto &all_rr, const auto &timings,
                     float query_time, stats<indexType> &QueryStats) {
    auto [beam_search_time, other_time] = timings;

    float pointwise_recall = 0.0;
    float reported_results = 0.0;
    float total_results = 0.0;
    float num_nonzero = 0.0;

    //since distances are exact, just have to cross-check number of results
    size_t n = Query_Points.size();
    for (indexType i = 0; i < n; i++) {
      float num_reported_results = all_rr[i].size();
      float num_actual_results = GT[i].size();
      reported_results += num_reported_results;
      total_results += num_actual_results;
      // Queries without any ground-truth result do not enter the average.
      if(num_actual_results != 0) {pointwise_recall += num_reported_results/num_actual_results; num_nonzero++;}
    }

    pointwise_recall /= num_nonzero;
    float cumulative_recall = reported_results/total_results;

    float QPS = Query_Points.size() / query_time;
    std::cout << "For ";
    QP.print();
    std::cout << ", Point Recall=" << pointwise_recall
              << ", Cum Recall=" << cumulative_recall
              << ", Comparisons=" << QueryStats.dist_stats()[0]
              << ", Visited=" << QueryStats.visited_stats()[0]
              << ", QPS=" << QPS
              << ", ctime=" << (1e9 / (QPS * QueryStats.dist_stats()[0]))
              << ", timings= [" << beam_search_time<< ","<< other_time <<"]"
              << std::endl;
  };

  if(QP.range_query_type == Doubling) {

    parlay::internal::timer t;
    stats<indexType> QueryStats(Query_Points.size());
    parlay::sequence<indexType> start_points = {static_cast<indexType>(start_point)};

    auto [all_rr,timings] = DoubleBeamRangeSearch(G,
                                                  Query_Points, Base_Points,
                                                  Q_Query_Points, Q_Base_Points,
                                                  QueryStats, start_points, QP, active_indices);
    const float query_time = t.next_time();
    report(all_rr, timings, query_time, QueryStats);

  } else if (QP.range_query_type == Greedy || QP.range_query_type == Beam) {

    stats<indexType> QueryStats(Query_Points.size());
    parlay::internal::timer t;

    auto [all_rr, timings] = RangeSearch<Point,PointRange,QPointRange,indexType>(G,
                                                                      Query_Points, Base_Points,
                                                                      Q_Query_Points, Q_Base_Points,
                                                                      QueryStats, start_point, QP);
    const float query_time = t.next_time();
    report(all_rr, timings, query_time, QueryStats);

  }
  else {
    std::cout << "Error: No beam search type provided, -search_mode should be one of [doubling, greedy, beam]" << std::endl;
  }
}


// Runs checkRangeRecall once for each of a fixed list of beam sizes, using
// the beam size as both of the first two QueryParams arguments.
//
// When `is_early_stopping` is set, the early-stopping count passed to
// QueryParams is max(10, beam / 4); otherwise it is 0. `esr`, `rtype` and
// `rad` are forwarded to QueryParams unchanged. All queries are marked
// active.
template<typename Point, typename PointRange, typename QPointRange, typename indexType>
void range_search_wrapper(Graph<indexType> &G,
                          PointRange &Base_Points, PointRange &Query_Points,
                          QPointRange &Q_Base_Points, QPointRange &Q_Query_Points, 
                          RangeGroundTruth<indexType> GT, indexType start_point=0,
                          bool is_early_stopping = false, double esr = 0.0,
                          rangeQueryType rtype = None, double rad = 0.0) {

  const std::vector<long> beams = {10, 20, 30, 40, 50, 100, 1000, 2000, 3000};

  // Identity list: every query index is active.
  parlay::sequence<indexType> all = parlay::tabulate(Query_Points.size(), [&] (indexType i){return i;});

  for(long b: beams){
    const long es = is_early_stopping ? std::max((long)10, b/4) : 0;

    QueryParams QP(b, b, G.size(), G.max_degree(),
                   is_early_stopping, esr, es, rtype, rad);

    checkRangeRecall<Point>(G,
                            Base_Points, Query_Points,
                            Q_Base_Points, Q_Query_Points,
                            GT, QP, start_point, all);
  }
}

} // end namespace


================================================
FILE: algorithms/utils/csvfile.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

// source: https://gist.github.com/rudolfovich/f250900f1a833e715260a66c87369d15

#pragma once
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

namespace parlayANN {

class csvfile;

inline static csvfile& endrow(csvfile& file);
inline static csvfile& flush(csvfile& file);

// Small append-only CSV writer on top of std::ofstream.
//
// Values are streamed with operator<<; a separator is inserted before every
// value except the first one of a row. C strings and std::strings are
// wrapped in double quotes, with embedded double quotes doubled; every other
// type is written with the stream's default formatting.
//
// The stream is opened with failbit/badbit exceptions enabled, so a failed
// open or write throws std::ios_base::failure. The destructor never throws.
class csvfile {
  std::ofstream fs_;
  bool is_first_;                    // true until the first value of a row is written
  const std::string separator_;
  const std::string escape_seq_;     // text inserted before a special character
  const std::string special_chars_;  // characters that need escaping

 public:
  // Opens `filename` for appending. Throws std::ios_base::failure if the
  // file cannot be opened.
  csvfile(const std::string& filename, const std::string& separator = ",")
      : fs_(),
        is_first_(true),
        separator_(separator),
        escape_seq_("\""),
        special_chars_("\"") {
    fs_.exceptions(std::ios::failbit | std::ios::badbit);
    fs_.open(filename, std::ios::app);
  }

  // Flushes and closes the file on a best-effort basis. Stream exceptions
  // are switched off first: a destructor is implicitly noexcept, so letting
  // a failed flush throw here would call std::terminate.
  ~csvfile() {
    try {
      fs_.exceptions(std::ios::goodbit);
      fs_.flush();
      fs_.close();
    } catch (...) {
      // Nothing sensible can be done with an error during destruction.
    }
  }

  void flush() { fs_.flush(); }

  // Ends the current row (newline + flush) and starts a new one.
  void endrow() {
    fs_ << std::endl;
    is_first_ = true;
  }

  // Applies a manipulator such as endrow or flush.
  csvfile& operator<<(csvfile& (*val)(csvfile&)) { return val(*this); }

  csvfile& operator<<(const char* val) { return write(escape(val)); }

  csvfile& operator<<(const std::string& val) { return write(escape(val)); }

  template <typename T>
  csvfile& operator<<(const T& val) {
    return write(val);
  }

 private:
  // Writes one field, preceded by the separator unless it starts the row.
  template <typename T>
  csvfile& write(const T& val) {
    if (!is_first_) {
      fs_ << separator_;
    } else {
      is_first_ = false;
    }
    fs_ << val;
    return *this;
  }

  // Returns `val` wrapped in double quotes, with escape_seq_ inserted in
  // front of every character from special_chars_.
  std::string escape(const std::string& val) {
    std::ostringstream result;
    result << '"';
    std::string::size_type to, from = 0u, len = val.length();
    while (from < len && std::string::npos !=
                             (to = val.find_first_of(special_chars_, from))) {
      result << val.substr(from, to - from) << escape_seq_ << val[to];
      from = to + 1;
    }
    result << val.substr(from) << '"';
    return result.str();
  }
};

// Stream manipulator: `csv << endrow` ends the current row by calling
// file.endrow() and returns the same csvfile so calls can be chained.
inline static csvfile& endrow(csvfile& file) {
  file.endrow();
  return file;
}

// Stream manipulator: `csv << flush` calls file.flush() and returns the
// same csvfile so calls can be chained.
inline static csvfile& flush(csvfile& file) {
  file.flush();
  return file;
}

} // end namespace


================================================
FILE: algorithms/utils/doublingSearch.h
================================================
#include <algorithm>
#include <functional>
#include <random>
#include <set>
#include <unordered_set>
#include <queue>

#include "parlay/io.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"
#include "parlay/worker_specific.h"
#include "types.h"
#include "graph.h"
#include "stats.h"
#include "beamSearch.h"
#include "earlyStopping.h"

namespace parlayANN{
  // Range search by repeated beam doubling.
  //
  // For every entry of active_indices the query is first searched with
  // filtered_beam_search using beam size QP.beamSize and the early_stopping
  // predicate. Beam entries whose distance is within QP.radius are kept
  // (distances are recomputed against Base_Points when the exact and
  // quantized base points differ in byte size). If that round returned at
  // least QP.beamSize in-radius results, the search is repeated with
  // beam_search, seeded with the previous beam and a beam twice as large,
  // until a round yields fewer in-radius results than its beam size. The
  // results of the last round replace those of earlier rounds.
  //
  // Returns the per-query result ids (indexed like active_indices) together
  // with the pair (time spent in first rounds, time spent in later rounds),
  // each summed over all workers.
  //
  // NOTE(review): QueryStats is updated at position i (the position inside
  // active_indices), not at active_indices[i] — confirm this is intended
  // when active_indices is not the identity.
  template<typename PointRange,
           typename QPointRange,
           typename indexType>
std::pair<parlay::sequence<parlay::sequence<indexType>>,std::pair<double,double>>
DoubleBeamRangeSearch(Graph<indexType> &G,
                      PointRange &Query_Points, PointRange &Base_Points,
                      QPointRange &Q_Query_Points, QPointRange &Q_Base_Points,
                      stats<indexType> &QueryStats, 
                      parlay::sequence<indexType> starting_points,
                      QueryParams &QP, parlay::sequence<indexType> active_indices) {
  parlay::sequence<parlay::sequence<indexType>> all_neighbors(active_indices.size());
  parlay::WorkerSpecific<double> first_round_time;
  parlay::WorkerSpecific<double> second_round_time;
  bool use_rerank = (Base_Points.params.num_bytes() != Q_Base_Points.params.num_bytes());

  parlay::parallel_for(0, active_indices.size(), [&](size_t i) {
    // Both timers start running on construction; stop them so that only
    // the explicitly bracketed sections below are measured.
    parlay::internal::timer t_search_first("first round time");
    parlay::internal::timer t_search_other("after first round");
    t_search_first.stop();
    t_search_other.stop();

    t_search_first.start();
    auto P = Query_Points[active_indices[i]];
    auto Q_P = Q_Query_Points[active_indices[i]];
    using dtype = typename decltype(Query_Points[0])::distanceType;
    using id_dist = std::pair<indexType, dtype>;

    // Keeps the ids of the beam entries within QP.radius, re-scoring them
    // against the exact points when reranking is needed.
    auto within_radius = [&] (const auto &beam) {
      parlay::sequence<indexType> kept;
      for (auto b : beam){
        double dist;
        if (use_rerank) {dist = P.distance(Base_Points[b.first]);}
        else {dist = b.second;}
        if (dist <= QP.radius) kept.push_back(b.first);
      }
      return kept;
    };

    QueryParams QP1(QP.beamSize, QP.beamSize,
                    G.size(), G.max_degree(),
                    QP.is_early_stop, Q_P.translate_distance(QP.early_stopping_radius),
                    QP.early_stopping_count,
                    QP.range_query_type, QP.radius);

    auto [pairElts, dist_cmps] = filtered_beam_search(G, Q_P, Q_Base_Points, Q_P, Q_Base_Points,
                                                      starting_points, QP1, false,
                                                      early_stopping<std::vector<id_dist>>);
    auto [beamElts, visitedElts] = pairElts;

    QueryStats.increment_visited(i, visitedElts.size());
    QueryStats.increment_dist(i, dist_cmps);

    parlay::sequence<indexType> neighbors = within_radius(beamElts);

    // A full beam of in-radius results suggests more exist: keep doubling.
    bool results_smaller_than_beam = false;
    if (neighbors.size() < QP.beamSize)
      results_smaller_than_beam = true;

    all_neighbors[i] = std::move(neighbors);

    size_t initial_beam = QP.beamSize * 2;
    // The next round is seeded with the whole current beam.
    parlay::sequence<indexType> starting_points_idx;

    for (auto s : beamElts)
      starting_points_idx.push_back(s.first);
    t_search_first.stop();

    t_search_other.start();
    while(!results_smaller_than_beam){
      QueryParams QP2(initial_beam, initial_beam, 0.0, G.size(), G.max_degree());
      auto [roundPair, round_cmps] = beam_search(Q_P, G, Q_Base_Points, starting_points_idx, QP2);
      auto [roundBeam, roundVisited] = roundPair;

      starting_points_idx.clear();
      for (auto v : roundBeam)
        starting_points_idx.push_back(v.first);

      parlay::sequence<indexType> round_neighbors = within_radius(roundBeam);

      if (round_neighbors.size() < initial_beam)
        results_smaller_than_beam = true;

      // Moved, not copied: the local is not used again.
      all_neighbors[i] = std::move(round_neighbors);

      QueryStats.increment_visited(i, roundVisited.size());
      QueryStats.increment_dist(i, round_cmps);
      initial_beam *= 2;
    }
    t_search_other.stop();
    *first_round_time += t_search_first.total_time();
    *second_round_time += t_search_other.total_time();

  });


  double total_time_first = 0;
  double total_time_second = 0;
  for (auto x : first_round_time) total_time_first += x;
  for (auto y: second_round_time) total_time_second += y;

  // Move the result set into the returned pair instead of deep-copying it.
  return std::make_pair(std::move(all_neighbors),
                        std::make_pair(total_time_first,total_time_second));
}
}


================================================
FILE: algorithms/utils/earlyStopping.h
================================================
#pragma once
#include <algorithm>
#include <functional>
#include <random>
#include <set>
#include <unordered_set>
#include <queue>

#include "parlay/io.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"
#include "beamSearch.h"
#include "types.h"
#include "graph.h"
#include "stats.h"

namespace parlayANN{
  // Predicate deciding whether a search should be cut short.
  //
  // Returns true only when all of the following hold:
  //   - early stopping is enabled (QP.early_stopping_count > 0),
  //   - at least QP.early_stopping_count entries have been visited,
  //   - the first frontier entry is farther than QP.radius, and
  //   - the first unvisited frontier entry is farther than
  //     QP.early_stopping_radius.
  //
  // Both frontier and unvisited_frontier are indexed at [0] unconditionally.
  template<typename PointInfo>
  bool early_stopping(const PointInfo& frontier, 
                      const PointInfo& unvisited_frontier,
                      const PointInfo& visited,
                      const QueryParams& QP){
    const bool enabled = (QP.early_stopping_count > 0);
    const bool visit_quota_met = (visited.size() >= QP.early_stopping_count);
    const bool head_within_radius = (frontier[0].second <= QP.radius);
    const bool next_within_es_radius =
        (unvisited_frontier[0].second <= QP.early_stopping_radius);

    if (!enabled || !visit_quota_met) return false;
    return !(head_within_radius || next_within_es_radius);
  }
}


================================================
FILE: algorithms/utils/euclidian_point.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <algorithm>
#include <iostream>
#include <bitset>

#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/internal/file_map.h"

#include "types.h"
#include "NSGDist.h"
// #include "common/time_loop.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

namespace parlayANN {

// Squared Euclidean distance between two d-dimensional uint8_t vectors.
// Coordinates are widened before subtracting so the difference can be
// negative; the sum is accumulated in an int and returned as float.
inline float euclidian_distance_(const uint8_t *p, const uint8_t *q, unsigned d) {
  int sum = 0;
  for (unsigned k = 0; k < d; ++k) {
    const int32_t diff = (int32_t)((int16_t)q[k] - (int16_t)p[k]);
    sum += diff * diff;
  }
  return (float)sum;
}

// Squared Euclidean distance between two d-dimensional uint8_t vectors,
// accumulated in a 32-bit integer and returned as float.
inline float euclidian_distance(const uint8_t *p, const uint8_t *q, unsigned d) {
  int32_t sum = 0;
  for (unsigned k = 0; k < d; ++k) {
    const int32_t diff = (int32_t) p[k] - (int32_t) q[k];
    sum += diff * diff;
  }
  return (float)sum;
}

// Squared Euclidean distance between two d-dimensional uint16_t vectors,
// scaled down by 256 (the integer sum is shifted right by 8 bits before the
// conversion to float).
//
// The per-coordinate difference is squared in 64-bit arithmetic: a
// difference can be as large as 65535, whose square does not fit in a
// 32-bit int.
inline float euclidian_distance(const uint16_t *p, const uint16_t *q, unsigned d) {
  int64_t result = 0;
  for (unsigned i = 0; i < d; i++) {
    const int64_t diff = static_cast<int64_t>(p[i]) - static_cast<int64_t>(q[i]);
    result += diff * diff;
  }
  return (float) (result >> 8);
}

// Squared Euclidean distance between two d-dimensional int8_t vectors.
// Coordinates are widened before subtracting; the sum is accumulated in an
// int and returned as float.
inline float euclidian_distance(const int8_t *p, const int8_t *q, unsigned d) {
  int sum = 0;
  for (unsigned k = 0; k < d; ++k) {
    const int32_t diff = (int32_t)((int16_t)q[k] - (int16_t)p[k]);
    sum += diff * diff;
  }
  return (float)sum;
}

// Squared Euclidean distance between two d-dimensional float vectors,
// accumulated in single precision in index order.
//
// Declared inline like the integer overloads: the function is defined in a
// header, so a non-inline definition would be duplicated in every
// translation unit that includes it.
inline float euclidian_distance(const float *p, const float *q, unsigned d) {
  // efanna2e::DistanceL2 distfunc;
  // return distfunc.compare(p, q, d);
  float result = 0.0;
  for (unsigned i = 0; i < d; i++) {
    const float diff = q[i] - p[i];
    result += diff * diff;
  }
  return result;
}

// Point whose coordinates have type T_ and which is compared with the
// squared Euclidean distance (see the euclidian_distance overloads).
//
// A point is a view: it stores a pointer to params.dims coordinates and never
// frees it.  `range` is the largest value a scalar-quantized coordinate may
// take; by default it is the maximum of an unsigned integer as wide as T_.
template<typename T_, long range=(1l << sizeof(T_)*8) - 1>
struct Euclidian_Point {
  using distanceType = float;
  using T = T_;
  using byte = uint8_t;

  // Scalar-quantization parameters.  A source value x is stored as
  // round(x * slope) - offset (see translate_point).
  struct parameters {
    float slope;
    int32_t offset;
    int dims;
    // Bytes occupied by one point.
    int num_bytes() const {return dims * sizeof(T);}
    parameters() : slope(0), offset(0), dims(0) {}
    // Identity mapping (slope 1, offset 0): values are stored unchanged.
    parameters(int dims) : slope(1.0), offset(0), dims(dims) {}
    // Maps [min_val, max_val] onto [0, range].
    parameters(float min_val, float max_val, int dims)
      : slope(range / (max_val - min_val)),
        offset((int32_t) round(min_val * slope)),
        dims(dims) {}
  };

  static distanceType d_min() {return 0;}
  static constexpr bool is_metric = true;
  T operator[](long i) const {return *(values + i);}

  // Distance to x in the units of the stored (possibly quantized) values.
  float distance(const Euclidian_Point& x) const {
    return euclidian_distance(this->values, x.values, params.dims);
  }

  // Scales r by slope^2; for 2-byte T it additionally divides by 256, which
  // mirrors the ">> 8" applied by the uint16_t euclidian_distance overload.
  float translate_distance(double r) const {
    if constexpr (sizeof(T) == 2)
                   return r * params.slope * params.slope / 256;
    else return r * params.slope * params.slope;
  }

  // Divides every coordinate by the point's L2 norm, in place.  A zero
  // vector is left unchanged (its norm is treated as 1).
  void normalize() {
    double norm = 0.0;
    for (int j = 0; j < params.dims; j++)
      norm += values[j] * values[j];
    norm = std::sqrt(norm);
    if (norm == 0) norm = 1.0;
    for (int j = 0; j < params.dims; j++)
      values[j] = values[j] / norm;
  }

  // Issues one prefetch per 64 bytes of the coordinate array.
  void prefetch() const {
    int l = (params.dims * sizeof(T) - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch((char*) values + i* 64);
  }

  long id() const {return id_;}

  // Initializers are listed in member declaration order (params first).
  Euclidian_Point() : params(0), values(nullptr), id_(-1) {}

  Euclidian_Point(byte* values, long id, parameters params)
    : params(params), values((T*) values), id_(id) {}

  // template <typename Point>
  // Euclidian_Point(const Point& p, const parameters& params) : id_(-1), params(params) {
  //   float slope = params.slope;
  //   int32_t offset = params.offset;
  //   float min_val = std::floor(offset / slope);
  //   float max_val = std::ceil((range + offset) / slope);
  //   values = new T[params.dims];
  //   if (slope == 1 && offset == 0) {
  //     for (int j = 0; j < params.dims; j++)
  //       values[j] = (T) p[j];
  //   } else {
  //     for (int j = 0; j < params.dims; j++) {
  //       auto x = p[j];
  //       if (x < min_val || x > max_val) {
  //         std::cout << x << " is out of range: [" << min_val << "," << max_val << "]" << std::endl;
  //         abort();
  //       }
  //       int64_t r = (int64_t) (std::round(x * slope)) - offset;
  //       if (r < 0 || r > range) {
  //         std::cout << "out of range: " << r << ", " << range << ", " << x << ", " << std::round(x * slope) - offset << ", " << slope << ", " << offset << std::endl;
  //         abort();
  //       }
  //       values[j] = (T) r;
  //     }
  //   }
  // }

  // Coordinate-wise equality over this point's params.dims coordinates.
  bool operator==(const Euclidian_Point& q) const {
    for (int i = 0; i < params.dims; i++) {
      if (values[i] != q.values[i]) {
        return false;
      }
    }
    return true;
  }

  // True when both points view the same storage.
  bool same_as(const Euclidian_Point& q) const {
    return values == q.values;
  }

  // Writes the representation of source point p into byte_values, which
  // must have room for params.num_bytes() bytes.  With the identity mapping
  // the coordinates are copied; otherwise each coordinate is scaled, rounded,
  // shifted, and clamped to [0, range].
  template <typename Point>
  static void translate_point(byte* byte_values, const Point& p, const parameters& params) {
    T* values = (T*) byte_values;
    float slope = params.slope;
    int32_t offset = params.offset;
    if (slope == 1.0 && offset == 0)
      for (int j = 0; j < params.dims; j++)
        values[j] = p[j];
    else {
      //float min_val = std::floor(offset / slope);
      //float max_val = std::ceil((range + offset) / slope);
      for (int j = 0; j < params.dims; j++) {
        auto x = p[j];
        // if (x < min_val || x > max_val) {
        //   std::cout << x << " is out of range: [" << min_val << "," << max_val << "]" << std::endl;
        //   abort();
        // }
        int64_t r = (int64_t) (std::round(x * slope)) - offset;
        // Out-of-range values are clamped rather than rejected.
        if (r < 0) r = 0;
        if (r > range) r = range;
        // if (r < 0 || r > range) {
        //   std::cout << "out of range: " << r << ", " << range << ", " << x << ", " << std::round(x * slope) - offset << ", " << slope << ", " << offset << std::endl;
        //   abort();
        // }
        values[j] = (T) r;
      }
    }
  }

  // Scans every coordinate of the point range pr and derives quantization
  // parameters from the smallest and largest value seen.  Both extremes start
  // at 0, so the resulting interval always contains 0.
  //
  // When every coordinate is a non-negative integer that already fits in T,
  // the interval is widened to exactly [0, range], which makes the slope 1 and
  // the offset 0 so translate_point copies values unchanged.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    long n = pr.size();
    int dims = pr.dimension();
    using MT = float; // typename PR::Point::T;
    parlay::sequence<MT> mins(n, 0.0);
    parlay::sequence<MT> maxs(n, 0.0);
    parlay::sequence<bool> ni(n, true);
    parlay::parallel_for(0, n, [&] (long i) {
      for (int j = 0; j < dims; j++) {
        auto x = pr[i][j];  // read each coordinate once
        ni[i] = ni[i] && (x >= 0) && (x - (long) x) == 0;
        mins[i]= std::min<MT>(mins[i], x);
        maxs[i]= std::max<MT>(maxs[i], x);}});
    float min_val = *parlay::min_element(mins);
    float max_val = *parlay::max_element(maxs);
    bool all_ints = *parlay::min_element(ni);
    if (all_ints) {
      // The upper bound must equal the largest representable value (255 or
      // 65535), not one past it, for the slope to come out as exactly 1.
      if (sizeof(T) == 1 && max_val < 256) max_val = 255;
      else if (sizeof(T) == 2 && max_val < 65536) max_val = 65535;
      min_val = 0;
    }
    std::cout << "scalar quantization: min value = " << min_val
              << ", max value = " << max_val << std::endl;
    return parameters(min_val, max_val, dims);
  }

  parameters params;

private:
  T* values;
  long id_;
};

// One-bit-per-dimension sketch of a point built from a sparse random
// projection.  Output bit i is the sign of (sum of nz/2 randomly chosen
// source coordinates) minus (sum of another nz/2 randomly chosen source
// coordinates).  The distance between two sketches is the number of bits in
// which they differ.
//
// A point is a view over sizeof(std::bitset<jl_dims>) bytes that it does not
// own.
template <int jl_dims>
struct Euclidean_JL_Sparse_Point {
  using distanceType = float;
  using Data = std::bitset<jl_dims>;
  using byte = uint8_t;
  constexpr static int nz = 6; // number of non_zeros per row

  struct parameters {
    // jl_dims * nz source-coordinate indices; row i occupies entries
    // [i * nz, (i + 1) * nz).
    std::vector<int> JL_indices;
    int source_dims;
    // Bytes occupied by one sketch.
    int num_bytes() const {return sizeof(Data);}
    parameters() : source_dims(0) {}
    parameters(int dims) : source_dims(dims) {}
    parameters(std::vector<int> const& JL_indices,
               int source_dims)
      : JL_indices(JL_indices), source_dims(source_dims) {
      std::cout << "JL sparse quantization, dims = " << jl_dims << std::endl;
    }
  };

  static constexpr bool is_metric = false;

  // Bit j presented as +1 (set) or -1 (clear).
  int8_t operator [] (long j) const {
    Data* pbits = (Data*) values;
    return (*pbits)[j] ? 1 : -1;}

  // Number of differing bits between the two sketches.
  float distance(const Euclidean_JL_Sparse_Point &q) const {
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    return (*pbits ^ *qbits).count();
  }

  // Issues one prefetch per 64 bytes of the sketch.
  void prefetch() const {
    int l = (sizeof(Data) - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch((char*) values + i* 64);
  }

  // Identity of the point objects themselves (address comparison).
  // NOTE(review): Euclidian_Point::same_as compares the underlying storage
  // pointers instead; confirm which meaning callers expect.
  bool same_as(const Euclidean_JL_Sparse_Point& q) const {
    return &q == this;
  }

  long id() const {return id_;}

  // The parameters argument is accepted for interface uniformity and unused.
  Euclidean_JL_Sparse_Point(byte* values, long id, const parameters& p)
    : values(values), id_(id) {}

  bool operator==(const Euclidean_JL_Sparse_Point &q) const {
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    return *pbits == *qbits; }

  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }

  // Builds the sketch of source point p in place at `values`, which must
  // provide params.num_bytes() suitably aligned bytes.
  template <typename In_Point>
  static void translate_point(byte* values, const In_Point& p, const parameters& params) {
    Data* bits = new (values) Data;
    const std::vector<int>& jli = params.JL_indices;
    for (int i = 0; i < jl_dims; i++) {
      double vv = 0.0;
      // first half of the row is added, second half subtracted
      for (int j = 0; j < nz/2; j++)
        vv += (float) p[jli[i * nz + j]];
      for (int j = nz/2; j < nz; j++)
        vv -= (float) p[jli[i * nz + j]];
      (*bits)[i] = (vv > 0);
    }
  }

  // Draws jl_dims * nz random source-coordinate indices using a
  // default-seeded std::mt19937, so the projection is the same on every run.
  // Requires pr.dimension() >= 1.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    int source_dims = pr.dimension();
    std::vector<int> JL_indices(jl_dims * nz);
    std::mt19937 rng;
    // uniform_int_distribution samples the CLOSED interval [a, b], so the
    // upper bound has to be the last valid coordinate index.
    std::uniform_int_distribution<std::mt19937::result_type> dist_i(0, source_dims - 1);
    for (int i = 0; i < jl_dims * nz; i++) {
      JL_indices[i] = dist_i(rng);
    }
    return parameters(JL_indices, source_dims);
  }

private:
  byte* values;
  long id_;
};

// One-bit-per-dimension quantization: bit i is set when source coordinate i
// is greater than a global median.  Bits are stored in consecutive
// std::bitset<64> blocks, and the distance between two points is the number
// of differing bits.
//
// A point is a view over params.num_bytes() bytes that it does not own.
// num_bytes() assumes sizeof(std::bitset<64>) == 8.
struct Euclidean_Bit_Point {
  using distanceType = float;
  using Data = std::bitset<64>;
  using byte = uint8_t;

  struct parameters {
    int dims;
    long median;
    // 8 bytes per started block of 64 dimensions.
    int num_bytes() const {return ((dims - 1) / 64 + 1) * 8;}
    parameters() : dims(0), median(0) {}
    parameters(int dims, long median)
      : dims(dims), median(median) {
      std::cout << "single-bit quantization with median: " << median << std::endl;
    }
  };

  static constexpr bool is_metric = false;

  // Bit j as 0 or 1.
  int8_t operator [] (long j) const {
    Data* pbits = (Data*) values;
    return pbits[j/64][j%64];
  }

  // Number of differing bits, summed over every 64-bit block.
  float distance(const Euclidean_Bit_Point &q) const {
    int num_blocks = (params.dims - 1)/64 + 1;
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    int cnt = 0;
    for (int i=0; i < num_blocks; i++)
      cnt += (pbits[i] ^ qbits[i]).count();
    return cnt;
  }

  // Issues one prefetch per 64 bytes of the bit array.
  void prefetch() const {
    int l = (params.num_bytes() - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch((char*) values + i* 64);
  }

  // Identity of the point objects themselves (address comparison).
  // NOTE(review): Euclidian_Point::same_as compares the underlying storage
  // pointers instead; confirm which meaning callers expect.
  bool same_as(const Euclidean_Bit_Point& q) const {
    return &q == this;
  }

  long id() const {return id_;}

  Euclidean_Bit_Point(byte* values, long id, const parameters& params)
    : values(values), id_(id), params(params) {}

  bool operator==(const Euclidean_Bit_Point &q) const {
    int num_blocks = (params.dims - 1)/64 + 1;
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    for (int i = 0; i < num_blocks; i++)
      if (pbits[i] != qbits[i]) return false;
    return true;
  }

  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }

  // Builds the bit representation of source point p in place at `values`,
  // which must provide params.num_bytes() suitably aligned bytes.
  template <typename In_Point>
  static void translate_point(byte* values, const In_Point& p, const parameters& params) {
    Data* pbits = (Data*) values;
    // Start from all-zero blocks so the padding bits past `dims` in the last
    // block are deterministic; distance() and operator== compare whole blocks.
    int num_blocks = (params.dims - 1)/64 + 1;
    for (int b = 0; b < num_blocks; b++)
      new (pbits + b) Data();
    for (int i = 0; i < params.dims; i++)
      pbits[i/64][i%64] = p[i] > params.median;
  }

  // Gathers every coordinate of every point in pr, sorts them, and uses the
  // middle element as the threshold.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    long n = pr.size();
    int dims = pr.dimension();
    long len = n * dims;
    parlay::sequence<typename PR::Point::T> vals(len);
    parlay::parallel_for(0, n, [&] (long i) {
      for (int j = 0; j < dims; j++)
        vals[i * dims + j] = pr[i][j];
    });
    parlay::sort_inplace(vals);
    long median = vals[n*dims/2];
    return parameters(dims, median);
  }

private:
  byte* values;
  long id_;
  parameters params;
};

} // end namespace


================================================
FILE: algorithms/utils/graph.h
================================================
// This code is part of the Parlay Project
// Copyright (c) 2024 Guy Blelloch, Magdalen Dobson and the Parlay team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

//#define Flexible
#pragma once
#ifdef Flexible
#include "simpleGraph.h"
#else

#include <algorithm>
#include <fcntl.h>
#include <iostream>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/internal/file_map.h"

#include "types.h"

namespace parlayANN {
  
// View over the adjacency list of one vertex.
//
// The underlying slice has maxDeg + 1 entries of indexType: entry 0 holds the
// current degree and entries 1..degree hold the neighbor ids.  The view does
// not own the storage.  A default-constructed edgeRange views nothing; calling
// anything that reads the degree on it dereferences a null pointer.
template<typename indexType>
struct edgeRange{

  // Current number of neighbors.
  size_t size() const {return edges[0];}

  indexType id() const {return id_;}

  edgeRange()
    : edges(parlay::make_slice<indexType*, indexType*>(nullptr, nullptr)),
      maxDeg(0), id_(0) {}

  // [start, end) is the vertex's slot: one degree entry followed by room for
  // the neighbors, hence maxDeg = slot length - 1.
  edgeRange(indexType* start, indexType* end, indexType id)
    : edges(parlay::make_slice<indexType*, indexType*>(start,end)), id_(id) {
    maxDeg = edges.size() - 1;
  }

  // j-th neighbor (0-based).  Aborts when j is not below the current degree.
  indexType operator [] (indexType j) const {
    if (j >= edges[0]) {
      std::cout << "ERROR: index exceeds degree while accessing neighbors" << std::endl;
      abort();
    } else return edges[j+1];
  }

  // Appends one neighbor; aborts if the vertex is already at max degree.
  void append_neighbor(indexType nbh){
    if (edges[0] == maxDeg) {
      std::cout << "ERROR in append_neighbor: cannot exceed max degree "
                << maxDeg << std::endl;
      abort();
    } else {
      edges[edges[0]+1] = nbh;
      edges[0] += 1;
    }
  }

  // Replaces the neighbor list with the contents of r; aborts if r is longer
  // than max degree.
  template<typename rangeType>
  void update_neighbors(const rangeType& r){
    if (r.size() > maxDeg) {
      std::cout << "ERROR in update_neighbors: cannot exceed max degree "
                << maxDeg << std::endl;
      abort();
    }
    edges[0] = r.size();
    for (size_t i = 0; i < r.size(); i++) {
      edges[i+1] = r[i];
    }
  }

  // Appends the contents of r; aborts if the result would exceed max degree.
  template<typename rangeType>
  void append_neighbors(const rangeType& r){
    if (r.size() + edges[0] > maxDeg) {
      std::cout << "ERROR in append_neighbors for point " << id_
                << ": cannot exceed max degree " << maxDeg << std::endl;
      std::cout << edges[0] << std::endl;
      std::cout << r.size() << std::endl;
      abort();
    }
    for (size_t i = 0; i < r.size(); i++) {
      edges[edges[0] + i + 1] = r[i];
    }
    edges[0] += r.size();
  }

  void clear_neighbors(){
    edges[0] = 0;
  }

  // Prefetches the whole 64-byte lines covered by the degree entry plus the
  // current neighbors (the line count is rounded down).
  void prefetch() const {
    int l = ((edges[0] + 1) * sizeof(indexType))/64;
    for (int i = 0; i < l; i++)
      __builtin_prefetch((char*) edges.begin() + i *  64);
  }

  // Sorts the current neighbors in place with the given comparison.
  template<typename F>
  void sort(F&& less){
    std::sort(edges.begin() + 1, edges.begin() + 1 + edges[0], less);}

  // Iteration covers the current neighbors only (the degree entry is skipped).
  indexType* begin() const {return edges.begin() + 1;}

  indexType* end() const {return edges.begin() + 1 + edges[0];}

private:
  parlay::slice<indexType*, indexType*> edges;
  long maxDeg;
  indexType id_;
};

template<typename indexType_>
struct Graph{
  using indexType = indexType_;
  
  long max_degree() const {return maxDeg;}
  size_t size() const {return n;}

  Graph(){}

  void allocate_graph(long maxDeg, size_t n) {
    long cnt = n * (maxDeg + 1);
    long num_bytes = cnt * sizeof(indexType);
    indexType* ptr = (indexType*) aligned_alloc(1l << 21, num_bytes);
    madvise(ptr, num_bytes, MADV_HUGEPAGE);
    parlay::parallel_for(0, cnt, [&] (long i) {ptr[i] = 0;});
    graph = std::shared_ptr<indexType[]>(ptr, std::free);
  }

  Graph(long maxDeg, size_t n) : maxDeg(maxDeg), n(n) {
    allocate_graph(maxDeg, n);
  }

  Graph(char* gFile){
    std::ifstream reader(gFile);
    if (!reader.is_open()) {
      std::cout << "graph file " << gFile << " not found" << std::endl;
      abort();
    }

    //read num points and max degree
    indexType num_points;
    indexType max_deg;
    reader.read((char*)(&num_points), sizeof(indexType));
    n = num_points;
    reader.read((char*)(&max_deg), sizeof(indexType));
    maxDeg = max_deg;
    std::cout << "Graph: detected " << num_points
              << " points with max degree " << max_deg << std::endl;

    //read degrees and perform scan to find offsets
    indexType* degrees_start = new indexType[n];
    reader.read((char*) (degrees_start), sizeof(indexType) * n);
    indexType* degrees_end = degrees_start + n;
    parlay::slice<indexType*, indexType*> degrees0 =
      parlay::make_slice(degrees_start, degrees_end);
    auto degrees = parlay::tabulate(degrees0.size(), [&] (size_t i){
      return static_cast<size_t>(degrees0[i]);});
    auto [o, total] = parlay::scan(degrees);
    auto offsets = o;
    std::cout << "Total edges read from file: " << total << std::endl;
    offsets.push_back(total);

    allocate_graph(max_deg, n);

    //write 1000000 vertices at a time
    size_t BLOCK_SIZE = 1000000;
    size_t index = 0;
    size_t total_size_read = 0;
    while(index < n){
      size_t g_floor = index;
      size_t g_ceiling = g_floor + BLOCK_SIZE <= n ? g_floor + BLOCK_SIZE : n;
      size_t total_size_to_read = offsets[g_ceiling] - offsets[g_floor];
      indexType* edges_start = new indexType[total_size_to_read];
      reader.read((char*) (edges_start), sizeof(indexType) * total_size_to_read);
      indexType* edges_end = edges_start + total_size_to_read;
      parlay::slice<indexType*, indexType*> edges =
        parlay::make_slice(edges_start, edges_end);
      indexType* gr = graph.get();
      parlay::parallel_for(g_floor, g_ceiling, [&] (size_t i){
        gr[i * (maxDeg + 1)] = degrees[i];
        for(size_t j = 0; j < degrees[i]; j++){
          gr[i * (maxDeg + 1) + 1 + j] = edges[offsets[i] - total_size_read + j];
        }
      });
      total_size_read += total_size_to_read;
      index = g_ceiling;
      delete[] edges_start;
    }
    delete[] degrees_start;
  }

  void save(char* oFile) {
    std::cout << "Writing graph with " << n
              << " points and max degree " << maxDeg
              << " to " << oFile 
              << std::endl;
    parlay::sequence<indexType> preamble =
      {static_cast<indexType>(n), static_cast<indexType>(maxDeg)};
    parlay::sequence<indexType> sizes = parlay::tabulate(n, [&] (size_t i){
      return static_cast<indexType>((*this)[i].size());});
    std::ofstream writer;
    writer.open(oFile, std::ios::binary | std::ios::out);
    writer.write((char*) preamble.begin(), 2 * sizeof(indexType));
    writer.write((char*) sizes.begin(), sizes.size() * sizeof(indexType));
    size_t BLOCK_SIZE = 1000000;
    size_t index = 0;
    while(index < n){
      size_t floor = index;
      size_t ceiling = index + BLOCK_SIZE <= n ? index + BLOCK_SIZE : n;
      auto edge_data = parlay::tabulate(ceiling - floor, [&] (size_t i){
        return parlay::tabulate(sizes[i + floor], [&] (size_t j){
          return (*this)[i + floor][j];});
      });
      parlay::sequence<indexType> data = parlay::flatten(edge_data);
      writer.write((char*)data.begin(), data.size() * sizeof(indexType));
      index = ceiling;
    }
    writer.close();
  }

  edgeRange<indexType> operator [] (indexType i) const {
    if (i > n) {
      std::cout << "ERROR: graph index out of range: " << i << std::endl;
      abort();
    }
    return edgeRange<indexType>(graph.get() + i * (maxDeg + 1),
                                graph.get() + (i + 1) * (maxDeg + 1),
                                i);
  }

  ~Graph(){}

private:
  size_t n;
  long maxDeg;
  std::shared_ptr<indexType[]> graph;
};

} // end namespace
#endif // flexible


================================================
FILE: algorithms/utils/graph_reorder.h
================================================
#include <atomic>
#include <limits>
#include <optional>
#include <tuple>
#include <utility>

#include <parlay/delayed.h>
#include <parlay/parallel.h>
#include <parlay/primitives.h>
#include <parlay/random.h>
#include <parlay/sequence.h>
#include <parlay/utilities.h>
#include <parlay/internal/get_time.h>

using vertex = int;
using w_type = float;
using edge_id = int;

using edge = std::pair<vertex,vertex>;
using w_edge = std::pair<edge,w_type>;

// A priority together with the id of the edge that produced it; the id
// breaks ties between equal priorities.
struct tagged_w_type {w_type w; edge_id i;};

// True when a ranks strictly above b: larger priority first, then larger
// edge id.  Declared inline because it is defined in a header.
inline bool greater(tagged_w_type a, tagged_w_type b) {
  if (a.w > b.w) return true;
  if (a.w == b.w) return a.i > b.i;
  return false;
}

// Per-vertex state used by recursive_reorder.
struct vertex_info {
  std::atomic<tagged_w_type> tw;  // best priority written to this vertex
  vertex size = 1;                // component size, later reused as an offset
};

// Uses recursive graph contraction to renumber a graph
//   E : sequence of weighted edges (only needed in one direction)
//   V : remaining vertices
//   W : per-vertex state: `tw` is temporary space for writing priorities,
//       `size` holds the size of the contracted vertex on the way down the
//       recursion and its offset on the way up
//   P : parent of each contracted vertex
//   i : recursion depth (the recursion stops after 300 rounds)
//   m : original number of edges (not currently used)
// Idea: each round of contraction identifies edges (u,v) that maximize
//     w(u,v)/(|u||v|) on both u and v.  These edges are contracted.
//     |u| is the number of vertices in the component and
//     w(u,v) is the number of edges between components u and v.
// Declared inline because it is defined in a header.
inline void recursive_reorder(parlay::sequence<w_edge>& E,
                              parlay::sequence<vertex>& V,
                              parlay::sequence<vertex_info>& W,
                              parlay::sequence<vertex>& P,
                              int i,
                              long m) {
  // std::cout << E.size() << ", " << V.size() << std::endl;

  // Base case: need to scan if more than one component
  if (i > 300 || E.size() == 0) {
    auto vsizes = parlay::tabulate(V.size(), [&] (long k) {return W[V[k]].size;});
    // Keep the scan result in a named pair and bind a reference to it:
    // a structured binding cannot be captured by a lambda in C++17.
    auto scanned = parlay::scan(vsizes);
    auto& offsets = scanned.first;
    parlay::parallel_for(0, V.size(), [&] (long k) {W[V[k]].size = offsets[k];});
    return;
  }

  // Write with max into W the priority (w(u,v)/(|u||v|)) to each endpoint
  // Priorities are tagged with id to break ties
  // Must first clear W at all active vertices
  float empty = std::numeric_limits<float>::lowest();
  parlay::for_each(V, [&] (vertex& v) {
      W[v].tw.store(tagged_w_type{empty, 0});});
  parlay::parallel_for(0, E.size(), [&] (edge_id e) {
      auto [u, v] = E[e].first;
      // Multiply the sizes as floats: the integer product overflows once
      // both components hold more than about 46000 vertices.
      w_type denom = (w_type) W[u].size * (w_type) W[v].size;
      auto w = tagged_w_type{E[e].second / denom, e};
      // write_min with `greater` as the order keeps the maximum
      parlay::write_min(&(W[v].tw), w, greater);
      parlay::write_min(&(W[u].tw), w, greater);});

  // Check for each active vertex u which edge (u,v) won on it.  If
  // the edge also won on v, then it is matched, we contract v into u,
  // and return the edge along with the old weight of u.
  auto matches = parlay::map_maybe(V, [&] (vertex& u) {
      long e = W[u].tw.load().i;
      if (W[u].tw.load().w != empty && E[e].first.first == u) {
        vertex v = E[e].first.second;
        if (W[v].tw.load().i == e) {
          vertex usize = W[u].size;
          W[u].size += W[v].size;
          P[v] = u;
          return std::optional(std::tuple(u, v, usize));
        }
      }
      return std::optional<std::tuple<vertex,vertex,vertex>>();});

  // Update edge endpoints and remove self edges
  E = parlay::map_maybe(E, [&] (w_edge e) {
        auto [u,v] = e.first;
        vertex pu = P[u];
        vertex pv = P[v];
        if (pu > pv) std::swap(pu,pv); // keep oriented low to high
        if (pu == pv) return std::optional<w_edge>();
        return std::optional(w_edge(edge(pu, pv), e.second));});

  // Combine redundant edges
  // For efficiency, only do this once every four rounds
  if (i % 4 == 3)
    E = parlay::reduce_by_key(E);

  // These are the remaining vertices after contraction
  V = parlay::filter(V, [&] (vertex v) {return P[v] == v;});

  // recurse
  recursive_reorder(E, V, W, P, i+1, m);

  // update sizes to give right offsets: v is placed after the vertices
  // that u's component held before v was contracted into it
  parlay::for_each(matches, [&] (auto match) {
        auto [u,v,usize] = match;
        W[v].size = W[u].size + usize;});
}

// Computes a new numbering of the n vertices of a graph by recursive
// contraction (see recursive_reorder).
//   E : sequence of edges, only needed in one direction; it is randomly
//       permuted in place
//   n : the number of vertices
// Returns, for each vertex, the final value of its `size` field, which
// recursive_reorder rewrites into an offset.
// Declared inline because it is defined in a header.
inline parlay::sequence<vertex> graph_reorder(parlay::sequence<edge>& E, long n) {
  E = parlay::random_shuffle(E); // randomly permute the edges

  // Initialize the working sequences: unit-weight edges, all vertices
  // active, fresh per-vertex state, and every vertex its own parent
  auto WE = parlay::map(E, [&] (edge e) { return w_edge(e,1); });
  auto V = parlay::tabulate(n, [] (vertex i) {return i;});
  parlay::sequence<vertex_info> W(n);
  auto P = V;

  // Call main routine
  recursive_reorder(WE, V, W, P, 0, E.size());
  return parlay::map(W, [] (auto& w) {return w.size;});
}


================================================
FILE: algorithms/utils/hashset.h
================================================
#ifndef ALGORITHMS_ANN_HASHSET_H_
#define ALGORITHMS_ANN_HASHSET_H_

#include <vector>
#include <cmath>
namespace parlayANN {

// a hashset of integer keys, using open addressing with linear probing
// grows as needed
//   hashset x(n); : creates an empty hashset x of initial capacity n
//                   (rounded up to a power of two; n <= 1 gives capacity 1)
//   x(i) : returns true if i in set, otherwise adds i to set and returns false
// The key value (K) -1 is reserved to mark empty slots: it is never stored
// and x((K) -1) always returns true.
  template <typename K>
  struct hashset {
    static constexpr K empty = (K) -1;
    int bits;                // log2 of the table size
    std::vector<K> entries;  // the table; unused slots hold `empty`
    size_t mask = 0;         // table size - 1
    long num_entries = 0;    // number of keys currently stored
    size_t hash(K const& k) const noexcept {
      return k * UINT64_C(0xbf58476d1ce4e5b9); }

    bool operator () (K a) {
      // fast path: key sits in its home slot
      if (entries[hash(a) & mask] == a) return true;
      // Grow before probing so that the table always has a free slot and
      // the probe below is guaranteed to terminate.
      if ((size_t) num_entries > entries.size()/2) grow_();
      // Probe with the current mask (it may just have changed).
      size_t loc = probe_(a);
      if (entries[loc] == a) return true;
      entries[loc] = a;
      num_entries++;
      return false;
    }

    hashset(long n) :
      bits(n > 1 ? (int) std::ceil(std::log2(n)) : 0),
      entries(std::vector<K>((1ul << bits), empty)),
      mask(entries.size() - 1)
    {}

  private:
    // Slot holding k, or the first free slot on k's probe sequence.
    size_t probe_(K k) const noexcept {
      size_t loc = hash(k) & mask;
      while (entries[loc] != empty && entries[loc] != k)
        loc = (loc + 1) & mask;
      return loc;
    }

    // Doubles the table and reinserts every stored key.
    void grow_() {
      bits = bits + 1;
      std::vector<K> old_entries(1ul << bits, empty);
      mask = old_entries.size() - 1;
      entries.swap(old_entries);
      num_entries = 0;  // recounted while reinserting
      for (K k : old_entries)
        if (k != empty) {
          entries[probe_(k)] = k;
          num_entries++;
        }
    }
  };

}
#endif // ALGORITHMS_ANN_HASHSET_H_


================================================
FILE: algorithms/utils/jl_point.h
================================================
#pragma once

#include <algorithm>
#include <iostream>
#include <bitset>

#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/internal/file_map.h"
#include "mips_point.h"
#include "types.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

namespace parlayANN {

// Point obtained by a dense random +/-1 projection of the source point down
// to jl_dims dimensions, normalized to unit length and then stored through
// Quantized_Mips_Point<8>.  Distance, equality, prefetch and id are delegated
// to the wrapped quantized point.
template <int jl_dims = 128>
struct Mips_JL_Point {
  using T = int8_t;
  using distanceType = float;
  using Point = Quantized_Mips_Point<8>;
  using Params = typename Point::parameters;
  using byte = uint8_t;

  struct parameters {
    // Projection matrix, row-major: entry [i * dims + j] is the sign applied
    // to source coordinate j for output coordinate i.
    std::vector<int8_t> JL_vects;
    Params mips_params;  // parameters of the wrapped quantized point
    int dims;            // number of source dimensions
    int num_bytes() const {return mips_params.dims;}
    parameters() : dims(0) {}
    parameters(int dims) : dims(dims) {}
    // d is the number of projected dimensions.  Initializers are listed in
    // member declaration order.
    parameters(std::vector<int8_t> const& JL_vects, int dims, int d)
      // vectors are normalized so few values will be greater than .3
      : JL_vects(JL_vects), mips_params(.3, d), dims(dims) {}
  };

  static constexpr bool is_metric = false;

  T operator [] (long j) const {return pt[j];}

  float distance(const Mips_JL_Point &q) const {
    return pt.distance(q.pt);
  }

  void prefetch() const { pt.prefetch(); }

  bool same_as(const Mips_JL_Point& q){
    return pt.same_as(q.pt);
  }

  long id() const {return pt.id();}

  // Keeps a pointer to p, so p must outlive the point.
  Mips_JL_Point(byte* values, long id, const parameters& p)
    : pt(Point(values, id, p.mips_params)), params(&p) {}

  bool operator==(const Mips_JL_Point &q) const {
    return pt == q.pt; }

  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }

  // Projects source point p, scales the projection to unit length, and hands
  // it to Point::translate_point to be written at `values`.
  template <typename In_Point>
  static void translate_point(byte* values, const In_Point& p, const parameters& params) {
    int dims = params.dims;
    const std::vector<int8_t>& jlv = params.JL_vects;
    int d = params.mips_params.dims;
    std::vector<float> v(d);
    double nn = 0.0;
    for (int i = 0; i < d; i++) {
      double vv = 0.0;
      for (int j = 0; j < dims; j++) {
        vv += (float) p[j] * (float) jlv[i * dims + j];
      }
      v[i] = vv;
      nn += vv * vv;
    }
    // An all-zero projection has no direction; leave it as zeros instead of
    // dividing by zero and filling v with NaNs.
    double norm = (nn > 0.0) ? 1.0 / sqrt(nn) : 1.0;
    for (int i = 0; i < d; i++) {
      v[i] = v[i] * norm;
    }

    Point::translate_point(values, v, params.mips_params);
  }

  // Draws a jl_dims x pr.dimension() matrix of random +/-1 entries from a
  // default-seeded std::mt19937, so the projection is the same on every run.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    int dims = pr.dimension();
    std::vector<int8_t> JL_vects(jl_dims * dims);
    std::mt19937 rng;
    std::uniform_int_distribution<std::mt19937::result_type> dist(0,1);
    for (int i = 0; i < jl_dims * dims; i++)
      JL_vects[i] = (dist(rng) == 0) ? -1 : 1;
    return parameters(std::move(JL_vects), dims, jl_dims);
  }

private:
  Point pt;
  const parameters* params;
};

// One-bit-per-dimension sketch built from a dense random +/-1 projection:
// output bit i is set when the signed sum of the source coordinates for row i
// is positive.  The distance between two sketches is the number of bits in
// which they differ.
//
// A point is a view over sizeof(std::bitset<jl_dims>) bytes that it does not
// own.
template <int jl_dims>
struct Mips_JL_Bit_Point {
  using distanceType = float;
  using Data = std::bitset<jl_dims>;
  using byte = uint8_t;

  struct parameters {
    // Projection matrix, row-major: entry [i * source_dims + j] is the sign
    // applied to source coordinate j for output bit i.
    std::vector<int8_t> JL_vects;
    int source_dims;
    int dims;  // number of output bits; always jl_dims
    // Bytes occupied by one sketch.
    int num_bytes() const {return sizeof(Data);}
    // Every constructor sets `dims`, so it is never read uninitialized.
    parameters() : source_dims(0), dims(jl_dims) {}
    parameters(int dims) : source_dims(dims), dims(jl_dims) {}
    parameters(std::vector<int8_t> const& JL_vects, int source_dims)
      : JL_vects(JL_vects), source_dims(source_dims), dims(jl_dims) {
      std::cout << "JL dense quantization, dims = " << jl_dims << std::endl;
    }
  };

  static constexpr bool is_metric  = false;

  // Bit j presented as +1 (set) or -1 (clear).
  int8_t operator [] (long j) const {
    Data* pbits = (Data*) values;
    return (*pbits)[j] ? 1 : -1;}

  // Number of differing bits between the two sketches.
  float distance(const Mips_JL_Bit_Point &q) const {
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    return (*pbits ^ *qbits).count();
  }

  // Issues one prefetch per 64 bytes of the sketch.
  void prefetch() const {
    int l = (sizeof(Data) - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch((char*) values + i* 64);
  }

  // Identity of the point objects themselves (address comparison).
  bool same_as(const Mips_JL_Bit_Point& q) const {
    return &q == this;
  }

  long id() const {return id_;}

  // The parameters argument is accepted for interface uniformity and unused.
  Mips_JL_Bit_Point(byte* values, long id, const parameters& p)
    : values(values), id_(id) {}

  bool operator==(const Mips_JL_Bit_Point &q) const {
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    return *pbits == *qbits; }

  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }

  // Builds the sketch of source point p in place at `values`, which must
  // provide params.num_bytes() suitably aligned bytes.
  template <typename In_Point>
  static void translate_point(byte* values, const In_Point& p, const parameters& params) {
    Data* bits = new (values) Data;
    const std::vector<int8_t>& jlv = params.JL_vects;
    for (int i = 0; i < jl_dims; i++) {
      double vv = 0.0;
      for (int j = 0; j < params.source_dims; j++) {
        vv += (float) p[j] * (float) jlv[i * params.source_dims + j];
      }
      (*bits)[i] = (vv > 0);
    }
  }

  // Draws a jl_dims x pr.dimension() matrix of random +/-1 entries from a
  // default-seeded std::mt19937, so the projection is the same on every run.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    int source_dims = pr.dimension();
    std::vector<int8_t> JL_vects(jl_dims * source_dims);
    std::mt19937 rng;
    std::uniform_int_distribution<std::mt19937::result_type> dist(0,1);
    for (int i = 0; i < jl_dims * source_dims; i++)
      JL_vects[i] = (dist(rng) == 0) ? -1 : 1;
    return parameters(std::move(JL_vects), source_dims);
  }

private:
  byte* values;
  long id_;
};

// Bit-sketch point built from a sparse random sign projection: each of the
// jl_dims output bits is the sign of a signed sum of `nz` source coordinates.
// The distance between two sketches is the Hamming distance of their bits.
// A point only wraps a pointer to an encoded buffer; it does not own it.
template <int jl_dims>
struct Mips_JL_Sparse_Point {
  using distanceType = float;
  using Data = std::bitset<jl_dims>;
  using byte = uint8_t;
  constexpr static int nz = 5; // non-zero entries in each projection row

  // Description of the projection, shared by all points encoded with it.
  struct parameters {
    std::vector<int8_t> JL_signs;  // -1 / +1 weight of every sampled entry
    std::vector<int> JL_indices;   // source coordinate read by every entry
    int source_dims;
    int dims;                      // only set by the three-argument constructor
    int num_bytes() const {return sizeof(Data);}
    parameters() : source_dims(0) {}
    parameters(int dims) : source_dims(dims) {}
    parameters(std::vector<int8_t> const& JL_signs,
               std::vector<int> const& JL_indices,
               int source_dims)
      : JL_signs(JL_signs), JL_indices(JL_indices), source_dims(source_dims), dims(jl_dims) {
      std::cout << "JL sparse quantization, dims = " << jl_dims << std::endl;
    }
  };

  static constexpr bool is_metric = false;

  // Coordinate j as a sign: +1 if bit j is set, otherwise -1.
  int8_t operator [] (long j) const {
    const Data& sketch = *((Data*) values);
    if (sketch[j]) return 1;
    return -1;
  }

  // Number of bit positions in which the two sketches differ.
  float distance(const Mips_JL_Sparse_Point &q) const {
    const Data& mine = *((Data*) values);
    const Data& theirs = *((Data*) q.values);
    Data differing = mine ^ theirs;
    return differing.count();
  }

  // Prefetches every 64-byte chunk covering sizeof(Data) bytes of the sketch.
  void prefetch() const {
    const char* base = (char*) values;
    const int chunks = (sizeof(Data) - 1)/64 + 1;
    int chunk = 0;
    while (chunk < chunks) {
      __builtin_prefetch(base + chunk * 64);
      chunk++;
    }
  }

  // True only when q is this very object (buffers are not compared).
  bool same_as(const Mips_JL_Sparse_Point& q){
    const Mips_JL_Sparse_Point* other = &q;
    return this == other;
  }

  long id() const {
    return id_;
  }

  // Wraps an existing encoded buffer; the parameters argument is not used.
  Mips_JL_Sparse_Point(byte* values, long id, const parameters& p)
    : values(values),
      id_(id)
  {}

  // Equal when the bit sketches are identical.
  bool operator==(const Mips_JL_Sparse_Point &q) const {
    const Data& lhs = *((Data*) values);
    const Data& rhs = *((Data*) q.values);
    return lhs == rhs;
  }

  // Not supported for this representation: reports and aborts.
  void normalize() {
    std::cout << "can't normalize quantized point";
    std::cout << std::endl;
    abort();
  }

  // Encodes p into `values`: constructs an all-zero Data in place and sets
  // bit `row` when the signed sum of that row's nz sampled coordinates is
  // strictly positive.
  template <typename In_Point>
  static void translate_point(byte* values, const In_Point& p, const parameters& params) {
    Data& sketch = *(new (values) Data);
    const std::vector<int8_t>& signs = params.JL_signs;
    const std::vector<int>& picks = params.JL_indices;
    for (int row = 0; row < jl_dims; row++) {
      const int first = row * nz;
      double acc = 0.0;
      for (int k = first; k < first + nz; k++)
        acc += (float) p[picks[k]] * signs[k];
      sketch[row] = (acc > 0);
    }
  }

  // Draws the projection with a default-seeded std::mt19937: for each of the
  // jl_dims * nz entries, first a sign, then a source index in
  // [0, pr.dimension() - 1].
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    const int source_dims = pr.dimension();
    const int entries = jl_dims * nz;
    std::vector<int8_t> signs(entries);
    std::vector<int> picks(entries);
    std::mt19937 rng;
    using draw_t = std::mt19937::result_type;
    std::uniform_int_distribution<draw_t> sign_draw(0,1);
    std::uniform_int_distribution<draw_t> index_draw(0,source_dims - 1);
    for (int e = 0; e < entries; e++) {
      if (sign_draw(rng) == 0) signs[e] = -1;
      else signs[e] = 1;
      picks[e] = index_draw(rng);
    }
    return parameters(signs, picks, source_dims);
  }

private:
  byte* values;
  long id_;
};

// Sparse sign-projection sketch that additionally stores the Euclidean norm
// ("radius") of the source point as a float immediately after the bit vector.
// Encoded layout: [ Data (sizeof(Data) bytes) | float radius ].
// A point only wraps a pointer to such a buffer; it does not own it.
template <int jl_dims>
struct Mips_JL_Sparse_Point_Normalized {
  using distanceType = float;
  using Data = std::bitset<jl_dims>;
  using byte = uint8_t;
  constexpr static int nz = 5; // number of non_zeros per row
  
  // Description of the projection, shared by all points encoded with it.
  struct parameters {
    std::vector<int8_t> JL_signs;  // -1 / +1 weight of every sampled entry
    std::vector<int> JL_indices;   // source coordinate read by every entry
    int source_dims;
    int dims;
    int num_bytes() const {return sizeof(Data) + sizeof(float);}
    // dims is always jl_dims; initialise it in every constructor so it is
    // never read uninitialised.
    parameters() : source_dims(0), dims(jl_dims) {}
    parameters(int dims) : source_dims(dims), dims(jl_dims) {}
    parameters(std::vector<int8_t> const& JL_signs,
               std::vector<int> const& JL_indices,
               int source_dims)
      : JL_signs(JL_signs), JL_indices(JL_indices), source_dims(source_dims), dims(jl_dims) {
      std::cout << "JL sparse quantization, dims = " << jl_dims << std::endl;
    }
  };
  
  static constexpr bool is_metric = false;
  
  // Coordinate j as a sign: +1 if bit j is set, otherwise -1.
  int8_t operator [] (long j) const {
    Data* pbits = (Data*) values;
    return (*pbits)[j] ? 1 : -1;}

  // Hamming distance between the two sketches, scaled by the stored radius
  // of *this* point only (q's radius is not used, so the value is not
  // symmetric in its arguments).
  float distance(const Mips_JL_Sparse_Point_Normalized &q) const {
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    float pr = *((float*) (values + sizeof(Data)));
    return (*pbits ^ *qbits).count() * pr;
  }

  // Prefetches every 64-byte chunk covering sizeof(Data) bytes of the sketch.
  void prefetch() const {
    int l = (sizeof(Data) - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch((char*) values + i* 64);
  }
    
  // True only when q is this very object (buffers are not compared).
  bool same_as(const Mips_JL_Sparse_Point_Normalized& q) const {
    return &q == this;
  }

  long id() const {return id_;}

  // Wraps an existing encoded buffer; the parameters argument is not used.
  Mips_JL_Sparse_Point_Normalized(byte* values, long id, const parameters& p)
    : values(values), id_(id) {}

  // Equal when the bit sketches are identical (the radius is not compared).
  bool operator==(const Mips_JL_Sparse_Point_Normalized &q) const {
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    return *pbits == *qbits; }

  // Not supported for this representation: reports and aborts.
  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }

  // Encodes p into `values`: writes the Euclidean norm of p after the bit
  // vector and, for a non-zero point, sets bit i when the signed sum of row
  // i's nz sampled coordinates is strictly positive. A zero point keeps the
  // all-zero sketch produced by constructing Data in place.
  template <typename In_Point>
  static void translate_point(byte* values, const In_Point& p, const parameters& params) {
    Data* bits = new (values) Data;
    float* radius = (float*) (values + sizeof(Data));
    const std::vector<int8_t>& jls = params.JL_signs;
    const std::vector<int>& jli = params.JL_indices;
    double norm = 0.0;
    for (int j = 0; j < params.source_dims; j++)
      norm += p[j] * p[j];
    *radius = std::sqrt(norm);
    if (*radius > 0) {
      for (int i = 0; i < jl_dims; i++) {
        double vv = 0.0;
        for (int j = 0; j < nz; j++) 
          vv += (float) p[jli[i * nz + j]] * jls[i * nz + j];
        (*bits)[i] = (vv > 0);
      }
    }
  }

  // Draws the projection with a default-seeded std::mt19937: for each of the
  // jl_dims * nz entries, first a sign, then a source index.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    int source_dims = pr.dimension();
    std::vector<int8_t> JL_signs(jl_dims * nz);
    std::vector<int> JL_indices(jl_dims * nz);
    std::mt19937 rng;
    std::uniform_int_distribution<std::mt19937::result_type> dist_s(0,1);
    // uniform_int_distribution samples the CLOSED interval [a, b], so the
    // upper bound must be source_dims - 1; using source_dims would produce
    // an index one past the last coordinate of the source point.
    std::uniform_int_distribution<std::mt19937::result_type> dist_i(0,source_dims - 1);
    for (int i = 0; i < jl_dims * nz; i++) {
      JL_signs[i] = (dist_s(rng) == 0) ? -1 : 1;
      JL_indices[i] = dist_i(rng);
    }
    return parameters(JL_signs, JL_indices, source_dims);
  }

private:
  byte* values;
  long id_;
};

} // end namespace


================================================
FILE: algorithms/utils/mips_point.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <algorithm>
#include <iostream>
#include <bitset>
#include <bit>

#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/internal/file_map.h"
#include "types.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "NSGDist.h"

namespace parlayANN {

  // MIPS "distance" for unsigned 8-bit vectors: the inner product of the
  // first d coordinates, accumulated in an int and returned negated.
  inline float mips_distance(const uint8_t *p, const uint8_t *q, unsigned d) {
    int dot = 0;
    int k = 0;
    while (k < d) {
      const int32_t qk = (int32_t) q[k];
      const int32_t pk = (int32_t) p[k];
      dot += qk * pk;
      k++;
    }
    return -((float) dot);
  }

  // MIPS "distance" for signed 8-bit vectors: the inner product of the
  // first d coordinates, accumulated in an int and returned negated.
  inline float mips_distance(const int8_t *p, const int8_t *q, unsigned d) {
    int dot = 0;
    int k = 0;
    while (k < d) {
      const int32_t qk = (int32_t) q[k];
      const int32_t pk = (int32_t) p[k];
      dot += qk * pk;
      k++;
    }
    return -((float) dot);
  }

  // MIPS "distance" for float vectors: the inner product of the first d
  // coordinates, accumulated left to right in a float and returned negated.
  inline float mips_distance(const float *p, const float *q, unsigned d) {
    float dot = 0;
    int k = 0;
    while (k < d) {
      dot += (q[k]) * (p[k]);
      k++;
    }
    return -dot;
  }

// Dense point for maximum-inner-product search. The point is a thin,
// non-owning view over `params.dims` coordinates of type T; its distance to
// another point is whatever mips_distance returns for the two buffers.
template<typename T_>
struct Mips_Point {
  using T = T_;
  using distanceType = float;
  using byte = uint8_t;

  // Per-range metadata: the number of coordinates of every point.
  struct parameters {
    int dims;
    int num_bytes() const {return dims * sizeof(T);}
    parameters() : dims(0) {}
    parameters(int dims) : dims(dims) {}
  };

  static constexpr bool is_metric = false;

  // Lowest representable distance value (the most negative finite float).
  static distanceType d_min() {
    return -std::numeric_limits<float>::max();
  }

  // Coordinate i of the point.
  T operator [](long i) const {
    return values[i];
  }

  float distance(const Mips_Point<T>& x) const {
    return mips_distance(values, x.values, params.dims);
  }

  // Distances need no rescaling for this representation.
  float translate_distance(float r) const {
    return r;
  }

  // Prefetches every 64-byte chunk covering the point's coordinates.
  void prefetch() const {
    const char* base = (char*) values;
    int chunks = (params.dims * sizeof(T) - 1)/64 + 1;
    int chunk = 0;
    while (chunk < chunks) {
      __builtin_prefetch(base + chunk * 64);
      chunk++;
    }
  }

  long id() const {
    return id_;
  }

  // An empty point: no buffer, id -1, zero dimensions.
  Mips_Point()
    : values(nullptr),
      id_(-1),
      params(0)
  {}

  // Views the buffer `values` as an array of T; nothing is copied.
  Mips_Point(byte* values, long id, parameters params)
    : values((T*) values),
      id_(id),
      params(params)
  {}

  // Coordinate-wise equality over the first params.dims entries.
  bool operator==(const Mips_Point<T>& q) const {
    int j = 0;
    while (j < params.dims) {
      if (values[j] != q.values[j]) break;
      j++;
    }
    return j >= params.dims;
  }

  // True when both points view the same buffer.
  bool same_as(const Mips_Point<T>& q) const {
    return q.values == values;
  }

  // Scales the coordinates in place to unit Euclidean length; an all-zero
  // point is left unchanged (its norm is treated as 1).
  void normalize() {
    double sum_sq = 0.0;
    for (int j = 0; j < params.dims; j++)
      sum_sq += values[j] * values[j];
    double length = std::sqrt(sum_sq);
    if (length == 0) length = 1.0;
    float inv_length = 1.0 / length;
    for (int j = 0; j < params.dims; j++)
      values[j] = values[j] * inv_length;
  }

  // Copies p coordinate by coordinate into `values`, converting each to T.
  template <typename Point>
  static void translate_point(byte* values, const Point& p, const parameters& params) {
    T* out = (T*) values;
    for (int j = 0; j < params.dims; j++)
      out[j] = (T) p[j];
  }

  // Parameters for a range: only its dimension is needed.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    return parameters(pr.dimension());
  }

private:
  T* values;
  long id_;
  parameters params;
};

// template<typename T_, bool trim = false, int range = (1 << sizeof(T_)*8) - 1>
// struct Quantized_Mips_Point{
//   using T = T_;
//   using distanceType = float;
//   using byte = uint8_t;
  
//   struct parameters {
//     float max_val;
//     int dims;
//     int num_bytes() const {return dims * sizeof(T);}
//     parameters() : max_val(1), dims(0) {}
//     parameters(int dims) : max_val(1), dims(dims) {}
//     parameters(float max_val, int dims)
//       : max_val(max_val), dims(dims) {}
//   };

//   static distanceType d_min() {return -std::numeric_limits<float>::max();}
//   static bool is_metric() {return false;}
  
//   //T& operator [] (long j) const {if (j >= d) abort(); return *(values+j);}
//   T operator [] (long i) const {return *(values + i);}

//   float distance(int8_t* p, int8_t* q) const {
//     int32_t result = 0;
//     for (int i = 0; i < params.dims; i++){
//       result += (int16_t) p[i] * (int16_t) q[i];
//     }
//     //return (float) (r * r - result);
//     return (float) -result;
//   }

//   float distance(int16_t* p, int16_t* q) const {
//     int64_t result = 0;
//     for (int i = 0; i < params.dims; i++){
//       result += (int32_t) p[i] * (int32_t) q[i];
//     }
//     return (float) -result;
//   }

//   float distance(const Quantized_Mips_Point &x) const {
//     return distance(this->values, x.values);
//   }

//   void prefetch() const {
//     int l = (params.dims * sizeof(T) - 1)/64 + 1;
//     for (int i=0; i < l; i++)
//       __builtin_prefetch(values + i * 64);
//   }

//   bool same_as(const Quantized_Mips_Point& q){
//     return values == q.values;
//   }

//   long id() const {return id_;}

//   Quantized_Mips_Point(byte* values, long id, parameters p)
//     : values((T*) values), id_(id), params(p)
//   {}

//   bool operator==(const Quantized_Mips_Point &q) const {
//     for (int i = 0; i < params.dims; i++) {
//       if (values[i] != q.values[i]) {
//         return false;
//       }
//     }
//     return true;
//   }

//   void normalize() {
//     std::cout << "can't normalize quantized point" << std::endl;
//     abort();
//   }

//   template <typename Point>
//   static void translate_point(byte* byte_values, const Point& p, const parameters& params) {
//     T* values = (T*) byte_values;
//     for (int j = 0; j < params.dims; j++) {
//       float mv = params.max_val;
//       float pj = p[j];
//       if (pj < -mv) values[j] = - range/2 - 1;
//       else if (pj > mv) values[j] = range/2;
//       else {
//         //if (pj < -mv || pj > mv) {
//         //std::cout << pj << " is out of range, should be in [" << -mv << ":" << mv << "] " << std::endl;
//         //abort();
//         //}
//         int32_t x = std::round(pj * (range/2) / mv);
//         values[j] = (T) x;
//       }
//     }
//   }

//   template <typename PR>
//   static parameters generate_parameters(const PR& pr) {
//     long n = pr.size();
//     int dims = pr.dimension();
//     long len = n * dims;
//     parlay::sequence<typename PR::T> vals(len);
//     parlay::parallel_for(0, n, [&] (long i) {
//       for (int j = 0; j < dims; j++) 
//         vals[i * dims + j] = pr[i][j];
//     });
//     parlay::sort_inplace(vals);
//     float min_val, max_val;
//     if (trim) {
//       float cutoff = .0001;
//       min_val = vals[(long) (cutoff * len)];
//       max_val = vals[(long) ((1.0-cutoff) * (len-1))];
//     } else {
//       min_val = vals[0];
//       max_val = vals[len-1];
//     }
//     float bound = std::max(max_val, -min_val);

//     // parlay::sequence<typename PR::T> mins(n);
//     // parlay::sequence<typename PR::T> maxs(n);
//     // parlay::parallel_for(0, n, [&] (long i) {
//     //   mins[i] = 0.0;
//     //   maxs[i] = 0.0;
//     //   for (int j = 0; j < dims; j++) {
//     //     mins[i]= std::min(mins[i], pr[i][j]);
//     //     maxs[i]= std::max(maxs[i], pr[i][j]);}});
//     // float min_val = *parlay::min_element(mins);
//     // float max_val = *parlay::max_element(maxs);
//     // float bound = std::max(max_val, -min_val);
    
    
//     // if (sizeof(T) == 1) {
//     //   auto x = parlay::flatten(parlay::tabulate(n, [&] (long i) {
//     //     return parlay::tabulate(dims, [&] (long j) {
//     //       return 128 + (int8_t) (std::round(pr[i][j] * (range/2) / bound));});}));
//     //   auto y = parlay::histogram_by_index(x, 256);
//     //   for (int i = 0; i < 256; i++)
//     //     std::cout << i - 128 << ":" << y[i] << ", ";
//     //   std::cout << std::endl;
//     // }
//     std::cout << "scalar quantization: min value = " << min_val
//               << ", max value = " << max_val << std::endl;
//     return parameters(bound, dims); // 1.7 for glove-100, 1.4 for nytimes, 1.5 for glove-25 but bad
//   }

// private:
//   T* values;
//   long id_;
//   parameters params;
// };

// Scalar-quantized point for maximum-inner-product search.
//
// Every coordinate is mapped to a signed integer in [-range/2, range/2] by
// multiplying with `parameters::scale`, and stored in one of three layouts
// chosen from `bits`:
//   bits <= 4 : two coordinates per byte (even index in the low nibble,
//               odd index in the high nibble),
//   bits <= 8 : one int8_t per coordinate,
//   otherwise : one int16_t per coordinate.
// A point is a non-owning view over such an encoded buffer.
template<int bits, bool trim = false, int range = (1 << bits) - 1>
struct Quantized_Mips_Point{
  using T = int16_t;
  using distanceType = float; 
  using byte = uint8_t;

  // Bits of storage actually occupied by one coordinate in the layout above.
  static constexpr int storage_bits = (bits <= 4) ? 4 : ((bits <= 8) ? 8 : 16);
  
  struct parameters {
    float max_val;  // coordinates outside [-max_val, max_val] are clamped
    int dims;
    float scale;    // multiplier from source values to quantized integers
    // Sized from the storage width rather than `bits`, so widths other than
    // exactly 4, 8 or 16 still reserve what assign() writes.
    int num_bytes() const {return (dims * storage_bits - 1) / 8 + 1;}
    // scale is initialised in every constructor (it equals (range/2)/max_val
    // with max_val == 1 here) so translate_distance never reads garbage.
    parameters() : max_val(1), dims(0), scale(range/2) {}
    parameters(int dims) : max_val(1), dims(dims), scale(range/2) {}
    parameters(float max_val, int dims)
      : max_val(max_val), dims(dims), scale((range/2) / max_val) {}
  };

  static constexpr bool is_metric = false;
  
  // Decoded (sign-extended) quantized value of coordinate i.
  int operator [] (long i) const {
    if constexpr (bits <= 4) {
      if (i & 1)
        return ((int8_t) (values[i/2] & 240)) >> 4;
      else
        return ((int8_t) (values[i/2] << 4)) >> 4;
    } else {
      if constexpr (bits <= 8) {
        return *(((int8_t*) values) + i);
      } else {
        return *(((int16_t*) values) + i);
      }
    }
  }


  // Negated inner product of two int16-encoded buffers.
  distanceType distance_16(byte* p_, byte* q_) const {
    int16_t* p = (int16_t*) p_;
    int16_t* q = (int16_t*) q_;
    int64_t result = 0;
    for (int i = 0; i < params.dims; i++){
      result += (int32_t) p[i] * (int32_t) q[i];
    }
    return (distanceType) -result;
  }

  // Negated inner product of two int8-encoded buffers.
  distanceType distance_8(byte* p_, byte* q_) const {
    int8_t* p = (int8_t*) p_;
    int8_t* q = (int8_t*) q_;
    int32_t result = 0;
    for (int i = 0; i < params.dims; i++){
      result += (int16_t) p[i] * (int16_t) q[i];
    }
    return (distanceType) -result;
  }

  // Negated inner product of two nibble-packed buffers. Each nibble is
  // evaluated in the high half of an int8 (i.e. as 16 * value), so the
  // result is 256 times the inner product of the decoded values.
  distanceType distance_4(byte* p_, byte* q_) const {
    int8_t* p = (int8_t*) p_;
    int8_t* q = (int8_t*) q_;
    int32_t result = 0;
    int8_t mask = -16; // bit representation is 11110000, used as mask to extract high 4 bits
    const int full_bytes = params.dims/2; // bytes holding two coordinates
    // low nibbles (even coordinates)
    for (int i = 0; i < full_bytes; i++) {
      result += (int16_t) ((int8_t) (p[i] << 4)) * (int16_t) ((int8_t) (q[i] << 4));
    }
    // high nibbles (odd coordinates)
    for (int i = 0; i < full_bytes; i++){
      result += (int16_t) (p[i] & mask) * (int16_t) (q[i] & mask);
    }
    // With an odd number of coordinates the last one sits alone in the low
    // nibble of the final byte; include it so it is not silently dropped.
    if (params.dims & 1) {
      result += (int16_t) ((int8_t) (p[full_bytes] << 4))
              * (int16_t) ((int8_t) (q[full_bytes] << 4));
    }
    return (distanceType) -result;
  }

  // Distance in quantized units, dispatched on the storage layout.
  distanceType distance(const Quantized_Mips_Point &x) const {
    if constexpr (bits <= 4) {
      return distance_4(this->values, x.values);
    } else {
      if constexpr (bits <= 8) {
        return distance_8(this->values, x.values);
      } else {
        return distance_16(this->values, x.values);
      }
    }
  }

  // Multiplies r by scale squared.
  float translate_distance(float r) const {
    return r * params.scale * params.scale;
  }
  
  // Prefetches every 64-byte chunk covering the encoded point.
  void prefetch() const {
    int l = (params.num_bytes() - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch(values + i * 64);
  }

  // True when both points view the same buffer.
  bool same_as(const Quantized_Mips_Point& q) const {
    return values == q.values;
  }

  long id() const {return id_;}

  // Wraps an existing encoded buffer; nothing is copied.
  Quantized_Mips_Point(byte* values, long id, parameters p)
    : values(values), id_(id), params(p)
  {}

  // Coordinate-wise equality of the decoded values. Comparing decoded
  // coordinates (rather than the first `dims` raw bytes) is correct for
  // every layout: it neither reads past a nibble-packed buffer nor stops
  // half-way through an int16 one.
  bool operator==(const Quantized_Mips_Point &q) const {
    for (int i = 0; i < params.dims; i++) {
      if ((*this)[i] != q[i]) {
        return false;
      }
    }
    return true;
  }

  // Not supported for this representation: reports and aborts.
  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }

  // Stores quantized value v as coordinate i of the encoded buffer.
  static void assign(byte* values, int i, int v) {
    if constexpr (bits <= 4) {
      byte* p = values + i/2;
      // Mask v to four bits: a negative v has all upper bits set and would
      // otherwise overwrite the neighbouring nibble.
      if (i & 1) {
        *p = (*p & 15) | ((v & 15) << 4);
      } else {
        *p = (*p & 240) | (v & 15);
      }
    } else {
      if constexpr (bits <= 8) {
        ((int8_t*) values)[i] = (int8_t) v;
      } else {
        ((int16_t*) values)[i] = (int16_t) v;
      }
    }
  }
  
  // Quantizes p into byte_values: values beyond +/- max_val are clamped to
  // +/- range/2, everything else is scaled and rounded to nearest.
  template <typename Point>
  static void translate_point(byte* byte_values, const Point& p, const parameters& params) {
    for (int j = 0; j < params.dims; j++) {
      float mv = params.max_val;
      float scale = params.scale; //(range/2) / mv;
      float pj = p[j];
      // cap if underflow or overflow
      if (pj < -mv) assign(byte_values, j, - range/2); // - 1);
      else if (pj > mv) assign(byte_values, j, range/2);
      else {
        int32_t v = std::round(pj * scale); 
        assign(byte_values, j, v);
      }
    }
  }

  
  // Chooses the clamping bound from the data: per point the smallest and
  // largest coordinate (each starting from 0) are taken, then either the
  // overall extremes or, when `trim` is set, the values at rank 0.0001 * n
  // and 0.9999 * (n - 1) of those per-point extremes. The bound is the
  // larger magnitude of the two.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    long n = pr.size();
    int dims = pr.dimension();
    float min_val, max_val;
    auto min_per_point = parlay::delayed_tabulate(n, [&](size_t i) {
      float min = 0.0;
      auto p = pr[i];
      for (int j = 0; j < dims; j++) min = std::min<float>(min, p[j]);
      return min;
    });
    auto max_per_point = parlay::delayed_tabulate(n, [&](size_t i) {
      float max = 0.0;
      auto p = pr[i];
      for (int j = 0; j < dims; j++) max = std::max<float>(max, p[j]);
      return max;
    });
    if (trim) {
      double cutoff = .0001;
      size_t min_rank = cutoff * n;
      size_t max_rank = (1.0 - cutoff) * (n - 1);

      min_val = parlay::kth_smallest_copy(min_per_point, min_rank);
      max_val = parlay::kth_smallest_copy(max_per_point, max_rank);
      std::cout << "mips scalar quantization to " << bits
                << " bits. trimmed to: min = " << min_val
                << ", max = " << max_val << std::endl;
    } else {
      min_val = parlay::reduce(min_per_point, parlay::minm<float>());
      max_val = parlay::reduce(max_per_point, parlay::maxm<float>());
      std::cout << "mips scalar quantization to " << bits
                << " bits: min value = " << min_val
                << ", max value = " << max_val << std::endl;
    }
    float bound = std::max(max_val, -min_val);
    return parameters(bound, dims); // 1.7 for glove-100, 1.4 for nytimes, 1.5 for glove-25 but bad
  }

private:
  byte* values;
  long id_;
  parameters params;
};


// Three-valued (-1, 0, +1) quantized point for maximum-inner-product search.
//
// Coordinates are grouped in blocks of 64. Each block occupies two 64-bit
// words: word 2*i holds the sign bit of every coordinate (set for +1) and
// word 2*i + 1 holds a "non-zero" bit (clear when the coordinate was
// quantized to 0). A point is a non-owning view over such a buffer.
struct Mips_2Bit_Point {
  using distanceType = float;
  using byte = uint8_t;
  using word = std::bitset<64>;
  //using word = uint64_t; 
  using T = int8_t;

  static int pop_count(word x) {
    return x.count();
    //return __builtin_popcountl(x);
  }

  static void set_bit(word& x, int i, bool v) {
    x[i] = v;
    //x = (~(1ul << i) & x) | ((uint64_t) v << i);
  }
  
  struct parameters {
    float cut;  // magnitudes <= cut are quantized to 0
    int dims;
    // 16 bytes (two words) for every started block of 64 coordinates
    int num_bytes() const {return ((dims - 1) / 64 + 1) * 8 * 2;}
    parameters() : cut(.25), dims(0) {}
    parameters(int dims) : cut(.25), dims(dims) {}
    parameters(float cut, int dims)
      : cut(cut), dims(dims) {
      std::cout << "3-value quantization with cut = " << cut << std::endl;
    }
  };

  static constexpr bool is_metric = false;
  
  // Element access is not implemented for this representation.
  int operator [] (long i) const {
    abort();
  }

  // Negated inner product of the two three-valued vectors: every position
  // where both are non-zero contributes +1 when the signs differ and -1
  // when they agree.
  float distance_8(byte* p_, byte* q_) const {
    word* p = (word*) p_;
    word* q = (word*) q_;
    int num_blocks = params.num_bytes() / 16;
    // int (not int16_t) so the running sum cannot wrap for very large dims
    int total = 0;
    for (int i = 0; i < num_blocks; i++) {
      word not_equal = p[2 * i] ^ q[2 * i];
      word not_zero = p[2 * i + 1] & q[2 * i + 1];
      int num_neg = pop_count(not_equal & not_zero);
      int num_not_zero = pop_count(not_zero);
      total += (2 * num_neg) - num_not_zero;
    }
    return total;
  }

  float distance(const Mips_2Bit_Point &x) const {
    return distance_8(this->values, x.values);
  }
  
  // Prefetches every 64-byte chunk covering the encoded point.
  void prefetch() const {
    int l = (params.num_bytes() - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch(values + i * 64);
  }

  // True when both points view the same buffer.
  bool same_as(const Mips_2Bit_Point& q) const {
    return values == q.values;
  }

  long id() const {return id_;}

  // Wraps an existing encoded buffer; nothing is copied.
  Mips_2Bit_Point(byte* values, long id, parameters p)
    : values(values), id_(id), params(p)
  {}

  // Byte-wise equality of the encoded buffers.
  bool operator==(const Mips_2Bit_Point &q) const {
    for (int i = 0; i < params.num_bytes(); i++) {
      if (values[i] != q.values[i]) {
        return false;
      }
    }
    return true;
  }

  // Not supported for this representation: reports and aborts.
  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }
  
  // Quantizes p into byte_values. Every bit of every word is written: each
  // block is assembled in zero-initialised locals and then stored, so
  // padding positions past `dims` and the sign bits of zeroed coordinates
  // are always 0 regardless of what the buffer held before. This keeps
  // distance() and the byte-wise operator== independent of stale memory.
  template <typename Point>
  static void translate_point(byte* byte_values, const Point& p, const parameters& params) {
    // two words per block, one for -1, +1, the other to mark if non-zero
    int num_blocks = params.num_bytes() / 16;
    word* words = (word*) byte_values;
    float cv = params.cut;
    for (int i = 0; i < num_blocks; i++) {
      word sign;      // default-constructed bitsets are all zero
      word non_zero;
      for (int j = 0; j < 64 && j + i * 64 < params.dims; j++) {
        float pj = p[j + i * 64];
        if (pj < -cv) {
          set_bit(non_zero, j, true);   // -1: non-zero, sign bit stays clear
        } else if (pj > cv) {
          set_bit(non_zero, j, true);   // +1: non-zero, sign bit set
          set_bit(sign, j, true);
        }
        // otherwise the coordinate is quantized to 0 and both bits stay clear
      }
      words[2 * i] = sign;
      words[2 * i + 1] = non_zero;
    }
  }
  
  // Picks the cut from the data: all coordinates of all points are sorted
  // and the values at the 30% and 70% ranks are taken; the cut is the
  // larger of the upper value and the negated lower value.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    long n = pr.size();
    int dims = pr.dimension();
    long len = n * dims;
    using MT = float;
    parlay::sequence<MT> vals(len);
    parlay::parallel_for(0, n, [&] (long i) {
      for (int j = 0; j < dims; j++) 
        vals[i * dims + j] = pr[i][j];
    });
    parlay::sort_inplace(vals);
    float cutoff = .3;
    float min_cut = vals[(long) (cutoff * len)];
    float max_cut = vals[(long) ((1.0-cutoff) * (len-1))];
    float cut = std::max(max_cut, -min_cut);
    return parameters(cut, dims); 
  }

private:
  byte* values;
  long id_;
  parameters params;
};

// One-bit-per-coordinate quantized point: bit i is set when source
// coordinate i is strictly positive. Bits are stored in consecutive 64-bit
// blocks and the distance between two points is the Hamming distance of
// their bit vectors. A point is a non-owning view over such a buffer.
struct Mips_Bit_Point {
  using distanceType = float;
  using Data = std::bitset<64>;
  using byte = uint8_t;
  
  struct parameters {
    int dims;
    // 8 bytes for every started block of 64 coordinates
    int num_bytes() const {return ((dims - 1) / 64 + 1) * 8;}
    parameters() : dims(0) {}
    parameters(int dims)
      : dims(dims) {
      std::cout << "single-bit quantization" << std::endl;
    }
  };
  
  static constexpr bool is_metric = false;
  
  // Coordinate j as a sign: +1 if bit j is set, otherwise -1.
  int8_t operator [] (long j) const {
    Data* pbits = (Data*) values;
    return pbits[j/64][j%64] ? 1 : -1;
  }

  // Hamming distance over all blocks of the two bit vectors.
  float distance(const Mips_Bit_Point &q) const {
    int num_blocks = (params.dims - 1)/64 + 1;
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    int cnt = 0;
    // index block i on both sides; dereferencing the base pointers would
    // count the first block num_blocks times and ignore the rest
    for (int i=0; i < num_blocks; i++)
      cnt += (pbits[i] ^ qbits[i]).count();
    return cnt;
  }

  // Prefetches every 64-byte chunk covering the encoded point.
  void prefetch() const {
    int l = (params.num_bytes() - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch((char*) values + i* 64);
  }
    
  // True only when q is this very object (buffers are not compared).
  // NOTE(review): other point types in this file compare the value
  // pointers instead; confirm which notion of identity callers expect.
  bool same_as(const Mips_Bit_Point& q){
    return &q == this;
  }

  long id() const {return id_;}

  // Wraps an existing encoded buffer; nothing is copied.
  Mips_Bit_Point(byte* values, long id, const parameters& params)
    : values(values), id_(id), params(params) {}

  // Block-wise equality of the two bit vectors.
  bool operator==(const Mips_Bit_Point &q) const {
    int num_blocks = (params.dims - 1)/64 + 1;
    Data* pbits = (Data*) values;
    Data* qbits = (Data*) q.values;
    for (int i = 0; i < num_blocks; i++)
      if (pbits[i] != qbits[i]) return false;
    return true;
  }

  // Not supported for this representation: reports and aborts.
  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }

  // Encodes p into `values`. Each block is assembled in a zero-initialised
  // local and then stored whole, so the padding bits of the last block are
  // always 0 and distance()/operator== never depend on stale buffer content.
  template <typename In_Point>
  static void translate_point(byte* values, const In_Point& p, const parameters& params) {
    Data* pbits = (Data*) values;
    int num_blocks = (params.dims - 1)/64 + 1;
    for (int b = 0; b < num_blocks; b++) {
      Data block; // default-constructed bitset is all zero
      for (int j = 0; j < 64 && b * 64 + j < params.dims; j++)
        block[j] = (p[b * 64 + j] > 0);
      pbits[b] = block;
    }
  }

  // Parameters for a range: only its dimension is needed.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    return parameters(pr.dimension());
  }

private:
  byte* values;
  long id_;
  parameters params;
};


// Quantized point type that reserves four 64-bit words per block of 64
// coordinates (see parameters::num_bytes).
//
// NOTE(review): the pieces of this struct do not agree on a layout, so it
// looks unfinished -- confirm before using it:
//   * num_bytes() reserves 4 words (32 bytes) per block,
//   * distance() derives its block count from num_bytes() / 16, advances by
//     2 words per iteration and reads 4 words per iteration,
//   * translate_point() writes only words 2*i and 2*i + 1 (the same
//     sign / non-zero scheme as a two-word-per-block encoding).
struct Mips_4Bit_Point {
  using distanceType = float;
  using byte = uint8_t;
  using word = std::bitset<64>;
  //using word = uint64_t; 
  using T = int8_t;

  // Number of set bits in x.
  static int pop_count(word x) {
    return x.count();
  }

  // Sets bit i of x to v.
  static void set_bit(word& x, int i, bool v) {
    x[i] = v;
  }
  
  struct parameters {
    float cut;  // threshold compared against each coordinate in translate_point
    int dims;
    // 32 bytes for every started block of 64 coordinates
    int num_bytes() const {return ((dims - 1) / 64 + 1) * 8 * 4;}
    parameters() : cut(.25), dims(0) {}
    parameters(int dims) : cut(.25), dims(dims) {}
    parameters(float cut, int dims)
      : cut(cut), dims(dims) {
      std::cout << "3-value quantization with cut = " << cut << std::endl;
    }
  };

  static constexpr bool is_metric = false;
  
  // Element access is not implemented: always aborts.
  int operator [] (long i) const {
    abort();
  }

  // Among the positions set in both a and b, returns the number that are
  // also set in `plus` minus the number that are also set in `minus`.
  // Not called anywhere inside this struct: distance() defines a local
  // lambda with the same name that hides it.
  static int16_t triple(word a, word b, word plus, word minus) {
    word x = a & b;
    return pop_count(x & plus) - pop_count(x & minus);
  }
      
  // Combines the words of p_ and q_ block by block. For iteration i,
  // word 2*i of each side is XOR-ed to split positions into "minus"
  // (bits differ) and "plus" (bits agree); words 2*i+1, 2*i+2 and 2*i+3 of
  // each side are then paired with weights 1, 2 and 4 per side (product
  // weights 1..16), adding plus-count minus minus-count for every pair.
  //
  // NOTE(review): with num_blocks = num_bytes() / 16 the last iteration
  // reads words up to index 2 * num_blocks + 1, while num_bytes() covers
  // only 2 * num_blocks words (assuming sizeof(word) == 8), and successive
  // iterations overlap by two words. The result also accumulates in an
  // int16_t. Verify the intended stride and block count.
  float distance(byte* p_, byte* q_) const {
    word* p = (word*) p_;
    word* q = (word*) q_;
    int num_blocks = params.num_bytes() / 16;
    int16_t total = 0;
    for (int i = 0; i < num_blocks; i++) {
      word minus = p[2 * i] ^ q[2 * i];
      word plus = ~minus;
      // local helper capturing plus/minus by value; hides the static triple()
      auto triple = [=] (word a, word b) -> int16_t {
        word x = a & b;
        return pop_count(x & plus) - pop_count(x & minus);
      };
      total += triple(p[2 * i + 1], q[2 * i + 1]);
      total += triple(p[2 * i + 1], q[2 * i + 2]) * 2;
      total += triple(p[2 * i + 1], q[2 * i + 3]) * 4;
      total += triple(p[2 * i + 2], q[2 * i + 1]) * 2;
      total += triple(p[2 * i + 2], q[2 * i + 2]) * 4;
      total += triple(p[2 * i + 2], q[2 * i + 3]) * 8;
      total += triple(p[2 * i + 3], q[2 * i + 1]) * 4;
      total += triple(p[2 * i + 3], q[2 * i + 2]) * 8;
      total += triple(p[2 * i + 3], q[2 * i + 3]) * 16;
    }
    return total;
  }

  // Distance between this point's buffer and x's buffer.
  float distance(const Mips_4Bit_Point &x) const {
    return distance(this->values, x.values);
  }
  
  // Prefetches every 64-byte chunk covering num_bytes() bytes of the buffer.
  void prefetch() const {
    int l = (params.num_bytes() - 1)/64 + 1;
    for (int i=0; i < l; i++)
      __builtin_prefetch(values + i * 64);
  }

  // True when both points view the same buffer.
  bool same_as(const Mips_4Bit_Point& q){
    return values == q.values;
  }

  long id() const {return id_;}

  // Wraps an existing encoded buffer; nothing is copied.
  Mips_4Bit_Point(byte* values, long id, parameters p)
    : values(values), id_(id), params(p)
  {}

  // Byte-wise equality over num_bytes() bytes of the two buffers.
  bool operator==(const Mips_4Bit_Point &q) const {
    for (int i = 0; i < params.num_bytes(); i++) {
      if (values[i] != q.values[i]) {
        return false;
      }
    }
    return true;
  }

  // Not supported for this representation: reports and aborts.
  void normalize() {
    std::cout << "can't normalize quantized point" << std::endl;
    abort();
  }
  
  // Writes a sign bit (word 2*i) and a non-zero bit (word 2*i + 1) for each
  // coordinate: below -cut clears the sign bit, above +cut sets it, and
  // anything in between clears the non-zero bit.
  //
  // NOTE(review): on reaching the first index >= dims only that single
  // non-zero bit is cleared before returning, so later bits of that word,
  // the sign bits of in-between coordinates, and any words this loop never
  // reaches keep whatever the buffer already contained.
  template <typename Point>
  static void translate_point(byte* byte_values, const Point& p, const parameters& params) {
    // two words per block, one for -1, +1, the other to mark if non-zero
    int num_blocks = params.num_bytes() / 16;
    word* words = (word*) byte_values;
    float cv = params.cut;
    for (int i = 0; i < num_blocks; i++) {
      for (int j = 0; j < 64; j++) {
        if (j + i * 64 >= params.dims) {
          set_bit(words[2 * i + 1], j, false);
          return;
        }
        set_bit(words[2 * i + 1], j, true);
        float pj = p[j + i * 64];
        if (pj < -cv) set_bit(words[2 * i], j, false);
        else if (pj > cv) set_bit(words[2 * i], j, true);
        else set_bit(words[2 * i + 1], j, false);
      }
    }
  }
  
  // Picks the cut from the data: all coordinates of all points are copied
  // into one sequence and sorted, the values at the 30% and 70% ranks are
  // read, and the cut is the larger of the upper value and the negated
  // lower value.
  template <typename PR>
  static parameters generate_parameters(const PR& pr) {
    long n = pr.size();
    int dims = pr.dimension();
    long len = n * dims;
    using MT = float;
    parlay::sequence<MT> vals(len);
    parlay::parallel_for(0, n, [&] (long i) {
      for (int j = 0; j < dims; j++) 
        vals[i * dims + j] = pr[i][j];
    });
    parlay::sort_inplace(vals);
    float cutoff = .3;
    float min_cut = vals[(long) (cutoff * len)];
    float max_cut = vals[(long) ((1.0-cutoff) * (len-1))];
    float cut = std::max(max_cut, -min_cut);
    return parameters(cut, dims); 
  }

private:
  byte* values;
  long id_;
  parameters params;
};

} // end namespace


================================================
FILE: algorithms/utils/mmap.h
================================================
#ifndef ALGORITHMS_ANN_MMAP_H_
#define ALGORITHMS_ANN_MMAP_H_

#include <algorithm>
#include <iostream>

#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/internal/file_map.h"

namespace parlayANN {

// Maps the regular file `filename` into memory read-only (private mapping)
// and returns (pointer to the first byte, length in bytes).
//
// Any failure (open, fstat, not a regular file, mmap, close) prints a
// diagnostic to stderr and terminates the process with exit(-1). The file
// descriptor is closed before returning; the mapping itself is not released
// here, so a caller that wants to free it must munmap() it.
inline std::pair<char*, size_t> mmapStringFromFile(const char* filename) {
  const int fd = open(filename, O_RDONLY);
  if (fd == -1) {
    perror("open");
    exit(-1);
  }
  struct stat sb;
  if (fstat(fd, &sb) == -1) {
    perror("fstat");
    exit(-1);
  }
  if (!S_ISREG(sb.st_mode)) {
    // errno is not set on this path, so perror() would append an unrelated
    // system message; report the problem directly instead.
    fprintf(stderr, "%s: not a regular file\n", filename);
    exit(-1);
  }
  const size_t n = sb.st_size;
  char* p = static_cast<char*>(mmap(0, n, PROT_READ, MAP_PRIVATE, fd, 0));
  if (p == MAP_FAILED) {
    perror("mmap");
    exit(-1);
  }
  if (close(fd) == -1) {
    perror("close");
    exit(-1);
  }
  return std::make_pair(p, n);
}

} // end namespace

#endif // ALGORITHMS_ANN_MMAP_H_


================================================
FILE: algorithms/utils/parse_results.h
================================================
#ifndef ALGORITHMS_UTILS_PARSE_RESULTS_H_
#define ALGORITHMS_UTILS_PARSE_RESULTS_H_

#include <algorithm>
#include <set>

#include "parlay/parallel.h"
#include "parlay/primitives.h"

namespace parlayANN {
  
// Record describing one built graph index: a display name, the parameter
// string it was built with, its point count, degree statistics, and the
// build time in seconds.
struct Graph_ {
  std::string name;
  std::string params;
  long size;
  double avg_deg;
  int max_deg;
  double time;

  // Stores every argument in the field of the corresponding meaning.
  Graph_(std::string n, std::string p, long s, double ad, int md, double t)
      : name{n}, params{p}, size{s}, avg_deg{ad}, max_deg{md}, time{t} {}

  // Writes a three-line human-readable summary to stdout.
  void print() {
    std::cout << name << " graph built with ";
    std::cout << size << " points and parameters " << params << std::endl;

    std::cout << "Graph has average degree " << avg_deg;
    std::cout << " and maximum degree " << max_deg << std::endl;

    std::cout << "Graph built in " << time << " seconds" << std::endl;
  }
};

// Record describing one set of built LSH tables: a display name, the
// parameter string, the number of points, and the build time in seconds.
struct LSH {
  std::string name;
  std::string params;
  long size;
  double time;

  // Stores every argument in the field of the corresponding meaning.
  LSH(std::string n, std::string p, long s, double t)
      : name{n}, params{p}, size{s}, time{t} {}

  // Writes a two-line human-readable summary to stdout.
  void print() {
    std::cout << name << " LSH tables built with ";
    std::cout << size << " points and parameters " << params << std::endl;

    std::cout << "Tables built in " << time << " seconds" << std::endl;
  }
};

// One measurement of a range-search run: recall figures, comparison and
// visit counts (average and 99th percentile), throughput, and the search
// settings that produced them.
struct range_result {
  int num_queries;
  int num_nonzero_queries;

  double recall;
  double alt_recall;

  size_t avg_cmps;
  size_t tail_cmps;

  size_t avg_visited;
  size_t tail_visited;

  float QPS;

  int k;
  int beamQ;
  double slack;

  // `stats` must hold exactly four entries, in the order
  // {avg_cmps, tail_cmps, avg_visited, tail_visited}; any other length
  // aborts. The parameter `c` is accepted but not stored.
  range_result(int nq, int nnq, double r, double r2,
               parlay::sequence<size_t> stats, float qps, int K, int Q, float c,
               float s)
      : num_queries(nq), num_nonzero_queries(nnq),
        recall(r), alt_recall(r2),
        QPS(qps), k(K), beamQ(Q), slack(s) {
    const bool has_four_entries = (stats.size() == 4);
    if (!has_four_entries) abort();

    avg_cmps = stats[0];
    tail_cmps = stats[1];
    avg_visited = stats[2];
    tail_visited = stats[3];
  }

  // Writes the settings line, a blank line, and the recall / comparison /
  // visit figures to stdout.
  void print() {
    std::cout << "k = " << k << ", Q = " << beamQ << ", slack = " << slack
              << ", throughput = " << QPS << "/second" << std::endl;
    std::cout << std::endl;

    std::cout << "Num nonzero queries: " << num_nonzero_queries << std::endl;
    std::cout << "Nonzero recall: " << recall << std::endl;
    std::cout << "Alternate recall: " << alt_recall;
    std::cout << std::endl;

    std::cout << "Average dist cmps: " << avg_cmps;
    std::cout << ", 99th percentile dist cmps: " << tail_cmps << std::endl;
    std::cout << "Average num visited: " << avg_visited;
    std::cout << ", 99th percentile num visited: " << tail_visited << std::endl;
  }
};

// One measurement of a nearest-neighbor search run: recall, comparison and
// visit counts (average and 99th percentile), throughput, and the search
// settings that produced them.
struct nn_result {
  double recall;

  uint avg_cmps;
  uint tail_cmps;

  uint avg_visited;
  uint tail_visited;

  float QPS;

  int k;
  int beamQ;
  int limit;
  int degree_limit;
  int gtn;

  long num_queries;

  // `stats` must hold exactly four entries, in the order
  // {avg_cmps, tail_cmps, avg_visited, tail_visited}; any other length
  // aborts.
  nn_result(double r, parlay::sequence<uint> stats, float qps, int K, int Q,
            long q, int limit, int degree_limit, int gtn)
      : recall(r), QPS(qps), k(K), beamQ(Q),
        limit(limit), degree_limit(degree_limit), gtn(gtn),
        num_queries(q) {
    const bool has_four_entries = (stats.size() == 4);
    if (!has_four_entries) abort();

    avg_cmps = stats[0];
    tail_cmps = stats[1];
    avg_visited = stats[2];
    tail_visited = stats[3];
  }

  // Writes a compact one-line summary to stdout.
  void print() {
    std::cout << "For " << gtn << "@" << gtn << " recall = " << recall
              << ", QPS = " << QPS << ", Q = " << beamQ
              << ", visited limit = " << limit
              << ", degree limit: " << degree_limit
              << ", average visited = " << avg_visited
              << ", average cmps = " << avg_cmps << std::endl;
  }

  // Writes a multi-line report to stdout, including the 99th percentile
  // figures that print() omits.
  void print_verbose() {
    std::cout << "Over " << num_queries << " queries" << std::endl;
    std::cout << "k = " << k << ", Q = " << beamQ;
    std::cout << ", throughput = " << QPS << "/second" << std::endl;
    std::cout << "Recall: " << recall << std::endl;
    std::cout << "Average dist cmps: " << avg_cmps;
    std::cout << ", 99th percentile dist cmps: " << tail_cmps << std::endl;
    std::cout << "Average num visited: " << avg_visited;
    std::cout << ", 99th percentile num visited: " << tail_visited << std::endl;
  }
};

// One measurement of an LSH search run: recall, distance-comparison counts
// (average and 99th percentile), throughput, and the search settings.
struct lsh_result {
  double recall;

  size_t avg_cmps;
  size_t tail_cmps;

  float QPS;

  int k;
  int num_tables;
  long num_queries;

  // `stats` must hold exactly two entries, {avg_cmps, tail_cmps}; any other
  // length aborts.
  lsh_result(double r, parlay::sequence<size_t> stats, float qps, int K, int n,
             long q)
      : recall(r), QPS(qps), k(K), num_tables(n), num_queries(q) {
    const bool has_two_entries = (stats.size() == 2);
    if (!has_two_entries) abort();

    avg_cmps = stats[0];
    tail_cmps = stats[1];
  }

  // Writes a four-line report to stdout.
  void print() {
    std::cout << "Over " << num_queries << " queries" << std::endl;
    std::cout << "k = " << k << ", tables = " << num_tables;
    std::cout << ", throughput = " << QPS << "/second" << std::endl;
    std::cout << "Recall: " << recall << std::endl;
    std::cout << "Average dist cmps: " << avg_cmps;
    std::cout << ", 99th percentile dist cmps: " << tail_cmps << std::endl;
  }
};

// Picks, for each recall bucket, the result with the highest QPS.
//
// Bucket i covers results whose recall is at least buckets[i] and, unless
// it is the last bucket, at most buckets[i + 1]. Buckets with no matching
// result are skipped. Each winner is printed via its print() method.
//
// Returns (winners, lower bounds of the buckets that produced a winner),
// both in bucket order.
template <typename res>
auto parse_result(parlay::sequence<res> results,
                  parlay::sequence<float> buckets) {
  parlay::sequence<float> used_buckets;
  parlay::sequence<res> winners;
  auto slower = [&](res R, res S) { return R.QPS < S.QPS; };

  for (int idx = 0; idx < buckets.size(); idx++) {
    float lower = buckets[idx];
    auto at_least = [&](res R) { return R.recall >= lower; };
    parlay::sequence<res> candidates = parlay::filter(results, at_least);

    // apply the upper bound only when a next bucket exists and something
    // survived the lower bound
    const bool is_last = (idx == buckets.size() - 1);
    if (!is_last && candidates.size() != 0) {
      float upper = buckets[idx + 1];
      auto at_most = [&](res R) { return R.recall <= upper; };
      parlay::sequence<res> bounded = parlay::filter(candidates, at_most);
      candidates = bounded;
    }

    if (candidates.size() == 0) continue;

    res best = *(parlay::max_element(candidates, slower));
    best.print();
    winners.push_back(best);
    used_buckets.push_back(lower);
  }
  return std::make_pair(winners, used_buckets);
}

} // end namespace

#endif  // ALGORITHMS_UTILS_PARSE_RESULTS_H_


================================================
FILE: algorithms/utils/point_range.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <sys/mman.h>
#include <algorithm>
#include <iostream>

#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/internal/file_map.h"
#include "types.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

namespace parlayANN {

// Container for `n` encoded points of type `Point_`, stored back to back in
// one buffer with a fixed stride of `aligned_bytes` bytes per point. The
// buffer is held through a shared_ptr, so copies of a PointRange refer to
// the same storage.
template<class Point_>
struct PointRange{
  //using T = T_;
  using Point = Point_;
  using parameters = typename Point::parameters;
  using byte = uint8_t;

  // Number of coordinates per point, as recorded in the parameters.
  long dimension() const {return params.dims;}
  //long aligned_dimension() const {return aligned_dims;}

  // Empty range: no storage and zero points.
  PointRange()
    : values(std::shared_ptr<byte[]>(nullptr, std::free)), aligned_bytes(0), n(0) {}

  // Builds a range by translating every point of `pr` into this Point type
  // using parameters `p`. The per-point stride is 32 bytes when the encoding
  // fits in 32 bytes, otherwise the encoding size rounded up to a multiple
  // of 64.
  template <typename PR>
  PointRange(const PR& pr, const parameters& p) : params(p)  {
    n = pr.size();
    int num_bytes = p.num_bytes();
    aligned_bytes = (num_bytes <= 32) ? 32 : 64 * ((num_bytes - 1)/64 + 1);
    long total_bytes = n * aligned_bytes;
    values = allocate_values(total_bytes);
    byte* vptr = values.get();
    parlay::parallel_for(0, n, [&] (long i) {
      Point::translate_point(vptr + i * aligned_bytes, pr[i], params);});
  }

  // As above, with parameters derived from the points themselves.
  template <typename PR>
  PointRange (PR& pr) : PointRange(pr, Point::generate_parameters(pr)) { }

  // As above, with parameters derived from a dimension count.
  template <typename PR>
  PointRange (PR& pr, int dims) : PointRange(pr, Point::generate_parameters(dims)) { }

  // Loads points from a binary file laid out as
  //   [unsigned num_points][unsigned dims][num_points * num_bytes of data].
  // A null `filename` yields an empty range; a file that cannot be opened
  // aborts. Each point is copied to a stride rounded up to a multiple of 64
  // bytes.
  PointRange(char* filename)
    : values(std::shared_ptr<byte[]>(nullptr, std::free)), aligned_bytes(0), n(0) {
      if(filename == nullptr) {
        return;
      }
      std::ifstream reader(filename);
      if (!reader.is_open()) {
        std::cout << "Data file " << filename << " not found" << std::endl;
        std::abort();
      }

      // read the number of points and the dimension
      unsigned int num_points;
      unsigned int d;
      reader.read((char*)(&num_points), sizeof(unsigned int));
      n = num_points;
      reader.read((char*)(&d), sizeof(unsigned int));
      params = parameters(d);
      std::cout << "Data: detected " << num_points << " points with dimension " << d << std::endl;
      int num_bytes = params.num_bytes();
      aligned_bytes =  64 * ((num_bytes - 1)/64 + 1);
      if (aligned_bytes != num_bytes)
        std::cout << "Aligning bytes to " << aligned_bytes << std::endl;
      long total_bytes = n * aligned_bytes;
      values = allocate_values(total_bytes);

      // The file is read in blocks of at most BLOCK_SIZE points into one
      // reusable staging buffer, then scattered in parallel to the strided
      // destination.
      size_t BLOCK_SIZE = 1000000;
      std::unique_ptr<byte[]> staging(new byte[std::min(BLOCK_SIZE, n) * num_bytes]);
      byte* data_start = staging.get();
      size_t index = 0;
      while(index < n) {
          size_t floor = index;
          size_t ceiling = index+BLOCK_SIZE <= n ? index+BLOCK_SIZE : n;
          long m = ceiling - floor;
          reader.read((char*)(data_start), m * num_bytes);
          parlay::parallel_for(floor, ceiling, [&] (size_t i) {
            std::memmove(values.get() + i * aligned_bytes,
                         data_start + (i - floor) * num_bytes,
                         num_bytes);
          });
          index = ceiling;
      }
  }

  // Number of points in the range.
  size_t size() const { return n; }

  unsigned int get_dims() const { return params.dims; }

  // Returns a Point view of the i-th point. Aborts when `i` is not in
  // [0, n); the previous check used `i > n` and so let index n through.
  Point operator [] (long i) const {
    if (i < 0 || static_cast<size_t>(i) >= n) {
      std::cout << "ERROR: point index out of range: " << i << " from range " << n << ", " << std::endl;
      abort();
    }
    return Point(values.get()+i*aligned_bytes, i, params);
  }

  // Address of the encoded bytes of point i (not bounds-checked).
  byte* location(long i) const {
    return values.get() + i * aligned_bytes;
  }

  parameters params;

private:
  // Allocates `total_bytes` of 2MB-aligned storage, advises the kernel to
  // back it with huge pages, and wraps it in a shared_ptr released with
  // std::free. Aborts when a non-empty allocation fails, instead of handing
  // back a null buffer that would be dereferenced later.
  static std::shared_ptr<byte[]> allocate_values(long total_bytes) {
    byte* ptr = (byte*) aligned_alloc(1l << 21, total_bytes);
    if (ptr == nullptr) {
      if (total_bytes > 0) {
        std::cout << "ERROR: failed to allocate " << total_bytes
                  << " bytes for point data" << std::endl;
        std::abort();
      }
    } else {
      madvise(ptr, total_bytes, MADV_HUGEPAGE);
    }
    return std::shared_ptr<byte[]>(ptr, std::free);
  }

  std::shared_ptr<byte[]> values;
  long aligned_bytes;
  size_t n;
};

} // end namespace


================================================
FILE: algorithms/utils/rangeSearch.h
================================================
#include <algorithm>
#include <functional>
#include <random>
#include <set>
#include <unordered_set>  
#include <queue>

#include "parlay/io.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"
#include "beamSearch.h"
#include "earlyStopping.h"
#include "types.h"
#include "graph.h"
#include "stats.h"
#include "filtered_hashset.h"

namespace parlayANN {

  // Breadth-first expansion from `starting_points`, collecting every vertex
  // reached whose distance to `p` is at most `radius`.
  //
  // Starting points farther than `radius` are dropped. Each collected vertex
  // is expanded in turn: its not-yet-seen neighbors are prefetched, their
  // distances computed, and those within `radius` are appended to the
  // result (and expanded later).
  //
  // Returns (collected vertex ids, number of distance computations).
  // NOTE(review): has_been_seen(v) is presumed to record v and report
  // whether it had been recorded before -- hashset is defined elsewhere.
  template<typename Point, typename PointRange, typename indexType>
  std::pair<std::vector<indexType>, long>
greedy_search(Point p, Graph<indexType> &G, PointRange &Points,
              std::vector<std::pair<indexType, typename Point::distanceType>> &starting_points,
              double radius) {
  std::vector<indexType> in_range;
  hashset<indexType> has_been_seen(2 * starting_points.size() * 64);
  long num_cmps = 0;

  // seed with the starting points that are themselves within the radius
  for (auto [seed, seed_dist] : starting_points) {
    const bool repeated = has_been_seen(seed);
    if (repeated || seed_dist > radius) continue;
    in_range.push_back(seed);
  }

  // `in_range` doubles as the BFS queue: it grows while it is scanned
  std::vector<indexType> fresh;
  for (long cursor = 0; cursor < in_range.size(); cursor++) {
    indexType current = in_range[cursor];
    fresh.clear();
    for (long e = 0; e < G[current].size(); e++) {
      auto nbr = G[current][e];
      if (!has_been_seen(nbr)) {
        fresh.push_back(nbr);
        Points[nbr].prefetch();
      }
    }
    for (auto nbr : fresh) {
      num_cmps++;
      if (Points[nbr].distance(p) <= radius)
        in_range.push_back(nbr);
    }
  }

  return std::pair(std::move(in_range), num_cmps);
}

  // Does a priority-first search up to the radius given.
  //
  // Starting points within `radius` seed a min-priority queue keyed by
  // distance to `p`. The closest queued vertex is repeatedly popped, added
  // to the result, and its not-yet-seen neighbors are queued with their
  // distances. The search stops when the queue is empty or its closest
  // entry lies beyond `radius`.
  //
  // Returns (collected vertex ids, number of distance computations).
  // NOTE(review): has_been_seen(v) is presumed to record v and report
  // whether it had been recorded before -- hashset is defined elsewhere.
  template<typename Point, typename PointRange, typename indexType>
  std::pair<std::vector<indexType>, long>
greedy_search_pq(Point p, Graph<indexType> &G, PointRange &Points,
                 std::vector<std::pair<indexType, typename Point::distanceType>> &starting_points,
                 double radius) {

  std::vector<indexType> result;
  hashset<indexType> has_been_seen(2 * starting_points.size() * 64);

  long distance_comparisons = 0;
  using did = std::pair<typename Point::distanceType, indexType>;
  auto cmp = [] (did a, did b) {return a.first > b.first;};
  std::priority_queue<did, std::vector<did>, decltype(cmp)> pq(cmp);

  for (auto [v,d] : starting_points) {
    if (has_been_seen(v)) continue;
    if (d > radius ) continue;
    pq.push(std::pair(d,v));
  }

  std::vector<indexType> unseen;
  // The emptiness check is required: calling top() on an empty queue is
  // undefined, and the queue is empty when no starting point is in range or
  // once every reachable vertex has been consumed.
  while (!pq.empty() && pq.top().first <= radius) {
    auto nxt = pq.top().second;
    pq.pop();
    result.push_back(nxt);
    unseen.clear();
    for (long i = 0; i < G[nxt].size(); i++) {
      auto v = G[nxt][i];
      if (has_been_seen(v)) continue;
      unseen.push_back(v);
      Points[v].prefetch();
    }
    for (auto v : unseen) {
      distance_comparisons++;
      pq.push(std::pair(Points[v].distance(p), v));
    }
  }

  return std::pair(std::move(result), distance_comparisons);
}

  // A variant specialized for range searching.
  //
  // Breadth-first expansion from `starting_points`: every starting point is
  // queued unconditionally; each dequeued vertex is recorded as visited and
  // its unseen neighbors are queued when their distance to `p` is at most
  // `radius`. Vertices listed in `already_visited` are treated as seen and
  // are never queued through an edge.
  //
  // Returns (visited vertex ids in dequeue order, number of distance
  // computations). The count starts at starting_points.size().
template<typename Point, typename PointRange, typename indexType>
std::pair<std::vector<indexType>, size_t>
greedy_search_old(Point p, Graph<indexType> &G, PointRange &Points,
                  parlay::sequence<std::pair<indexType, typename Point::distanceType>> &starting_points,
                  double radius,
                  parlay::sequence<std::pair<indexType, typename Point::distanceType>> &already_visited) {
  using distanceType = typename Point::distanceType;

  // a dynamically sized hash set is needed here; insert() is a no-op for
  // ids that are already present
  std::unordered_set<indexType> has_been_seen;

  // everything from the visited list counts as seen
  for (const auto& v : already_visited) {
    has_been_seen.insert(v.first);
  }

  // FIFO frontier of vertex ids still to expand, seeded with the starting
  // points
  std::queue<indexType> frontier;
  for (const auto& q : starting_points) {
    has_been_seen.insert(q.first);
    frontier.push(q.first);
  }

  // ids of the vertices expanded so far
  std::vector<indexType> visited;

  size_t dist_cmps = starting_points.size();

  // used as a temporary in the loop
  std::vector<indexType> keep;
  keep.reserve(G.max_degree());

  // The main loop.  Terminates when the frontier is empty.
  while (!frontier.empty()) {
    indexType current = frontier.front();
    frontier.pop();
    G[current].prefetch();
    visited.push_back(current);

    // keep the neighbors that have not been seen yet, marking them seen and
    // prefetching their data before any distance is computed
    keep.clear();
    for (indexType i=0; i<G[current].size(); i++) {
      auto a = G[current][i];
      //TODO this is a bug when searching for a point not in the graph???
      if (a == p.id() || has_been_seen.count(a) > 0) continue;  // skip if already seen
      keep.push_back(a);
      Points[a].prefetch();
      has_been_seen.insert(a);
    }

    for (auto a : keep) {
      distanceType dist = Points[a].distance(p);
      dist_cmps++;
      // filter out if not within radius
      if (dist > radius) continue;
      frontier.push(a);
    }
  }

  return std::make_pair(std::move(visited), dist_cmps);
}

  // Answers one range query per entry of `Query_Points`, in parallel.
  //
  // For each query i:
  //   1. a beam search (filtered_beam_search) is run from `starting_point`
  //      on the Q_* point ranges, with the radii of `QP` passed through the
  //      query point's translate_distance();
  //   2. the beam elements whose distance is at most QP.radius are kept;
  //      when the Q_* and plain ranges differ in bytes per point, that
  //      distance is recomputed on Query_Points / Base_Points;
  //   3. if fewer than QP.beamSize elements were kept, or the query type is
  //      Beam, those elements are the answer; otherwise greedy_search
  //      expands from them and its output (re-checked against QP.radius
  //      when reranking) is the answer.
  // Visit and distance-comparison counts are added to `QueryStats`.
  //
  // Returns (per-query neighbor ids, (beam-search time, other time)), where
  // each time is the sum of the per-worker accumulators.
  template<typename Point, typename PointRange, typename QPointRange, typename indexType>
  std::pair<parlay::sequence<std::vector<indexType>>,std::pair<double,double>> 
RangeSearch(Graph<indexType> &G,
            PointRange &Query_Points, PointRange &Base_Points,
            QPointRange& Q_Query_Points, QPointRange &Q_Base_Points,
            stats<indexType> &QueryStats,
            indexType starting_point,
            QueryParams &QP) {

  parlay::sequence<indexType> starting_points = {starting_point};
  parlay::sequence<std::vector<indexType>> all_neighbors(Query_Points.size());
  // per-worker time accumulators, summed after the parallel loop
  parlay::WorkerSpecific<double> beam_time;
  parlay::WorkerSpecific<double> other_time;
  // rerank with the plain points whenever the two representations differ in
  // size
  bool use_rerank = (Base_Points.params.num_bytes() != Q_Base_Points.params.num_bytes());
  parlay::parallel_for(0, Query_Points.size(), [&](size_t i) {
    parlay::internal::timer t_search_beam("beam search time");
    parlay::internal::timer t_search_other("other time");
    // both timers are stopped right away and restarted around the section
    // they measure (presumably a timer runs from construction -- confirm)
    t_search_beam.stop();
    t_search_other.stop();
    std::vector<indexType> neighbors;
    std::vector<std::pair<indexType, typename Point::distanceType>> neighbors_with_distance;
    t_search_beam.start();
    using dtype = typename Point::distanceType;
    using id_dist = std::pair<indexType, dtype>;
    // per-query parameters: QP.beamSize is passed twice, followed by the
    // graph size and max degree, the early-stopping settings and the query
    // type; both radii go through this query point's translate_distance()
    QueryParams QP1(QP.beamSize, QP.beamSize, G.size(), G.max_degree(),
                    QP.is_early_stop, Q_Query_Points[i].translate_distance(QP.early_stopping_radius),
                    QP.early_stopping_count,
                    QP.range_query_type, Q_Query_Points[i].translate_distance(QP.radius));

    auto [pairElts, dist_cmps_beam] =
      filtered_beam_search(G, Q_Query_Points[i], Q_Base_Points,
                           Q_Query_Points[i], Q_Base_Points,
                           starting_points, QP1, false,
                           early_stopping<std::vector<id_dist>>);
    t_search_beam.stop();
    auto [beamElts, visitedElts] = pairElts;
    // keep the beam elements that are within the requested radius
    for (auto b : beamElts) {
      double dist;
      if (use_rerank) {
        dist = Query_Points[i].distance(Base_Points[b.first]);
      } else {
        dist = b.second;
      }
      if (dist <= QP.radius) {
        neighbors.push_back(b.first);
        // note: the pair pushed here carries the distance reported by the
        // beam search, not the reranked one
        neighbors_with_distance.push_back(b);
      }
    }
    if (neighbors.size() < QP.beamSize || QP.range_query_type == Beam){
      // the beam was not filled with in-range points (or only a beam search
      // was requested): the filtered beam is the answer
      all_neighbors[i] = std::move(neighbors);
    } else{
      // if using quantization then use slightly larger radius
      t_search_other.start();
      // 1.05 scales a positive radius up; .975 is applied to a non-positive
      // one (which moves a negative radius toward zero)
      double pad_factor = (QP1.radius > 0) ? 1.05 : .975;
      double radius = use_rerank ? pad_factor * QP1.radius : QP1.radius;
      auto [in_range, dist_cmps_greedy] =
        greedy_search(Q_Query_Points[i], G, Q_Base_Points,
                      neighbors_with_distance, radius);

      std::vector<indexType> ans;

      //#define EndWithBeam
#ifdef EndWithBeam
      // alternative finish: one more beam search seeded with the greedy
      // output, with a beam 10% larger than that output
      int beamSize = in_range.size() * 1.1;
      QueryParams QP2(beamSize, beamSize, G.size(), G.max_degree());
      auto [pairElts, dist_cmps2] = beam_search(Q_Query_Points[i], G, Q_Base_Points, in_range, QP2);
      for (auto r : pairElts.first) 
        if (Query_Points[i].distance(Base_Points[r.first]) <= QP.radius)
          ans.push_back(r.first);
#else
      // default finish: accept the greedy output, re-checking each point
      // against the requested radius when reranking
      for (auto r : in_range)
        if (!use_rerank || Query_Points[i].distance(Base_Points[r]) <= QP.radius)
          ans.push_back(r);
#endif
      all_neighbors[i] = std::move(ans);
      QueryStats.increment_visited(i, in_range.size());
      QueryStats.increment_dist(i, dist_cmps_greedy);
      t_search_other.stop();
    }
    

    // fold this query's timings and beam-search counts into the totals
    *beam_time += t_search_beam.total_time();
    *other_time += t_search_other.total_time();
    QueryStats.increment_visited(i, visitedElts.size());
    QueryStats.increment_dist(i, dist_cmps_beam);
    
  });

  // sum the per-worker accumulators
  double total_time_beam = 0;
  double total_time_other = 0;
  for (auto x : beam_time) total_time_beam += x;
  for (auto y: other_time) total_time_other += y;
  return std::make_pair(all_neighbors,std::make_pair(total_time_beam,total_time_other));
}

}


================================================
FILE: algorithms/utils/simpleGraph.h
================================================
// This code is part of the Parlay Project
// Copyright (c) 2024 Guy Blelloch, Magdalen Dobson and the Parlay team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <algorithm>
#include <fcntl.h>
#include <iostream>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/internal/file_map.h"
#include "../../parlaylib/examples/helper/graph_utils.h"

#include "types.h"

namespace parlayANN {
  
// Lightweight handle onto the out-neighbor list of one vertex. The handle
// does not own the list: it stores a pointer to it, so it stays valid only
// as long as the underlying list does.
template<typename indexType>
struct edgeRange{
  using graphUtils = graph_utils<indexType>;
  using Edges = typename graph_utils<indexType>::vertices;

  // Current number of neighbors.
  size_t size() const {return (*edges).size();}
  // Id of the vertex this handle refers to.
  indexType id() const {return id_;}

  // Detached handle; it must not be dereferenced.
  edgeRange() : edges(nullptr), maxDeg(0), id_(0) {}

  edgeRange(Edges* ngh, indexType maxDeg, indexType i)
    : edges(ngh), maxDeg(maxDeg), id_(i) {}

  // Returns the j-th neighbor. Aborts when `j` is not a valid position; the
  // previous check used `j > size()` and so let position size() through.
  indexType operator [] (indexType j) const {
    if (j >= size()) {
      std::cout << "ERROR: index exceeds degree while accessing neighbors" << std::endl;
      abort();
    }
    return (*edges)[j];
  }

  // Appends one neighbor (no degree limit is enforced here).
  void append_neighbor(indexType nbh){
    (*edges).push_back(nbh);
  }

  // Replaces the neighbor list with the contents of `r`.
  template<typename rangeType>
  void update_neighbors(const rangeType& r){
    (*edges).clear();
    for (size_t i = 0; i < r.size(); i++)
      (*edges).push_back(r[i]);
  }

  // Appends the contents of `r` to the neighbor list.
  template<typename rangeType>
  void append_neighbors(const rangeType& r){
    for (size_t i = 0; i < r.size(); i++)
      (*edges).push_back(r[i]);
  }

  void clear_neighbors(){
    (*edges).clear();
  }

  // Issues one prefetch per whole 64-byte chunk of the neighbor array.
  void prefetch() const {
    int l = (size() * sizeof(indexType))/64;
    for (int i = 0; i < l; i++)
      __builtin_prefetch(((char*) (*edges).data()) + i *  64);
  }

  // Sorts the neighbor list in place with the given ordering.
  template<typename F>
  void sort(F&& less){
    std::sort((*edges).begin(), (*edges).end(), less);}

  indexType* begin() const {return (*edges).data();}

  indexType* end() const {return (*edges).data() + size();}

private:
  Edges* edges;
  long maxDeg;
  indexType id_;
};

// Adjacency-list graph: one growable neighbor sequence per vertex, plus a
// recorded maximum degree.
template<typename indexType_>
struct Graph{
  using indexType = indexType_;
  using graphUtils = graph_utils<indexType>;
  using Edges = typename graph_utils<indexType>::vertices;
  using gtype = typename graphUtils::graph;

  long max_degree() const {return maxDeg;}
  // Number of vertices.
  size_t size() const {return graph.size();}

  // Empty graph with a recorded maximum degree of zero.
  Graph() : maxDeg(0) {}

  // Graph with `n` vertices and no edges; `maxDeg` is only recorded.
  Graph(long maxDeg, size_t n) : maxDeg(maxDeg), graph(gtype(n)) {}

  // Loads the graph from `gFile`; the recorded maximum degree is the largest
  // out-degree found in the file.
  Graph(char* gFile){
    std::string fname = gFile;
    graph = graphUtils::read_graph_from_file(fname);
    maxDeg = parlay::reduce(parlay::map(graph, parlay::size_of()),
                            parlay::maximum<size_t>());
  }

  // Writes the graph to `oFile`, announcing the write on stdout first.
  void save(char* oFile) {
    std::cout << "Writing graph with " << graph.size()
              << " points and max degree " << maxDeg
              << " to " << oFile 
              << std::endl;
    std::string fname = oFile;
    graphUtils::write_graph_to_file(graph, fname);
  }

  // Returns a handle onto the neighbor list of vertex `i`. Aborts when `i`
  // is not a valid vertex; the previous check used `i > graph.size()` and so
  // let index size() through. The handle is mutable although this accessor
  // is const, hence the const_cast.
  edgeRange<indexType> operator [] (indexType i) const {
    if (i >= graph.size()) {
      std::cout << "ERROR: graph index out of range: " << i << std::endl;
      abort();
    }
    return edgeRange<indexType>(const_cast<Edges*>(&graph[i]), maxDeg, i);
  }
  ~Graph(){}

private:
  indexType maxDeg;
  gtype graph;
};

} // end namespace


================================================
FILE: algorithms/utils/stats.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <algorithm>
#include <queue>
#include <set>

#include "graph.h"

#include "parlay/io.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "../../parlaylib/examples/BFS.h"
#include "graph_reorder.h"
#include "../../parlaylib/examples/helper/graph_utils.h"
#include "parlay/internal/get_time.h"

namespace parlayANN {

// Returns (average out-degree, maximum out-degree) of `G`.
inline std::pair<double, int> graph_stats_(Graph<unsigned int> &G) {
  const size_t num_vertices = G.size();
  // lazily evaluated out-degree of every vertex
  auto degrees = parlay::delayed_seq<size_t>(
      num_vertices, [&](size_t v) { return G[v].size(); });

  const size_t widest = parlay::max_element(degrees) - degrees.begin();
  int maxDegree = degrees[widest];

  const size_t degree_sum = parlay::reduce(degrees);
  const double mean_degree = degree_sum / ((double)num_vertices);
  return std::make_pair(mean_degree, maxDegree);
}

  // Prints diagnostic statistics about `G` to stdout:
  //   - an estimated "bytes per edge" figure for a delta-style encoding of
  //     the adjacency lists after reordering the vertices with
  //     graph_reorder (fed the first 20 neighbors of every vertex);
  //   - the maximum in-degree and the number of vertices with in-degree
  //     below 4;
  //   - BFS reachability from `start`: unreachable count, number of layers,
  //     and the size of each layer.
  template <typename indexType>
void print_graph_statistics(Graph<indexType> &G, indexType start) {
  long n = G.size();

  // convert to right format for transpose
  auto GG = parlay::tabulate(n, [&] (indexType i) {
                                  parlay::sequence<indexType> out;
                                  for (int x : G[i]) out.push_back(x);
                                  return out;
                                });
  // generate some statistics for the graph
  auto GTrans = graph_utils<indexType>::transpose(GG);
  auto inDegrees = parlay::map(GTrans, parlay::size_of());
  auto lowInDegrees = parlay::filter(parlay::iota(n), [&] (long i) {return inDegrees[i] < 4;});
  auto maxInDegree = *parlay::max_element(inDegrees);
  // for (auto u : lowDegrees) 
  //   for (auto v : G[u])
  //     G[v].append_neighbor(u);

  using etype = std::pair<int,int>;

  // only the first `psize` neighbors of each vertex feed the reordering
  int psize = 20;
  auto Gprefix = parlay::map(GG, [&] (auto& ngh) {
                                   if (ngh.size() > psize)
                                     return parlay::tabulate(psize, [&] (int i) {return ngh[i];});
                                   else return ngh;});

  // edges (u,v) with u < v, so each undirected pair appears at most once
  // per direction stored
  auto E = parlay::flatten(parlay::tabulate(G.size(), [&] (int u) {
                   return parlay::map_maybe(Gprefix[u], [=] (int v) {
                                                    if (u < v)
                                                      return std::optional<etype>(etype(u,v));
                                                    else return std::optional<etype>();});}));

  parlay::internal::timer t;
  auto result = graph_reorder(E, n);
  t.next("reorder time");
  //auto result = parlay::iota<int>(n);

  //auto ldiff = [] (int u, int v) {return std::log2(std::abs(u -v));};
  // cost of encoding the gap between two positions: 1, 2 or 3 units
  // depending on the size of the gap
  auto ldiff = [] (int u, int v) {
                 float diff = std::abs(u - v);
                 if (diff < 128) return 1;
                 if (diff < 16384) return 2;
                 else return 3;
               };
  // cost of one vertex: gap from the vertex to its smallest reordered
  // neighbor, plus the gaps between consecutive sorted neighbors
  auto vcost = [&] (int i) {
                 auto nghn = parlay::sort(parlay::map(GG[i], [&] (int v) {return result[v];}));
                 // a vertex without out-edges costs nothing; without this
                 // guard nghn[0] is read out of bounds and nghn.size()-1
                 // wraps around
                 if (nghn.size() == 0) return 0;
                 return ldiff(result[i],nghn[0]) +
                   parlay::reduce(parlay::tabulate(nghn.size()-1, [&] (int k) {return ldiff(nghn[k+1], nghn[k]);}));};
  float total_cost = parlay::reduce(parlay::tabulate(n, vcost));
  
  std::cout << "Bytes per edge: " << total_cost/parlay::reduce(parlay::map(GG, parlay::size_of())) << std::endl;

  auto layers = BFS(start, G);
  auto sizes = parlay::map(layers, parlay::size_of());
  auto visited = flatten(layers);
  
  std::cout << "Graph statistics:" << std::endl;
  //std::cout << "  average degree = " << float(parlay::reduce(GG, parlay::size_of()))/n << std::endl;
  //std::cout << "  max out-degree = " << *minDegreeLoc << std::endl;
  std::cout << "  max in-degree = " << maxInDegree << std::endl;
  std::cout << "  number with low (< 4) in-degree = " << lowInDegrees.size() << std::endl;
  std::cout << "  unreachable from source = " << (n - visited.size()) << "/" << n << std::endl;
  std::cout << "  radius from source = " << layers.size() << std::endl;
  std::cout << "  BFS level sizes = " << parlay::to_chars(sizes) << std::endl;
  }
  
// Per-point counters collected while building or querying an index:
// `visited` and `distances` each hold one accumulator per point id.
template<typename indexType>
struct stats{

  stats() {}

  // Creates zeroed counters for n points.
  stats(size_t n){
    visited = parlay::sequence<indexType>(n, 0);
    distances = parlay::sequence<indexType>(n, 0);
  }

  parlay::sequence<indexType> visited;
  parlay::sequence<indexType> distances;

  // Adds j to the counter of point i using a plain (non-atomic) addition.
  void increment_dist(indexType i, indexType j){
    distances[i]+=j;}
  void increment_visited(indexType i, indexType j){
    visited[i]+=j;}

  // Each returns {average, 99th-percentile} of the corresponding counters.
  parlay::sequence<indexType> visited_stats() const {return statistics(this->visited);}
  parlay::sequence<indexType> dist_stats() const {return statistics(this->distances);}

  // Resets every counter to zero while keeping the current number of points.
  void clear(){
    size_t n = visited.size();
    visited = parlay::sequence<indexType>(n, 0);
    distances = parlay::sequence<indexType>(n, 0);
  }

  // Returns {avg, tail} for s, where avg is the integer mean and tail is the
  // element at index floor(.99 * size) of the sorted sequence.
  // An empty input yields {0, 0}; previously this divided by zero and indexed
  // into an empty sequence.
  static parlay::sequence<indexType> statistics(const parlay::sequence<indexType>& s){
    if (s.size() == 0) {
      indexType zero = 0;
      return parlay::sequence<indexType>{zero, zero};
    }
    // Sum in `long` so that narrow index types do not overflow.
    auto sl = parlay::map(s, [] (long x) { return x;});
    indexType avg = (indexType) (parlay::reduce(sl) / s.size());
    size_t tail_index = static_cast<size_t>(.99 * ((float)s.size()));
    indexType tail = parlay::sort(s)[tail_index];
    return parlay::sequence<indexType>{avg, tail};
  }

};


// template <typename T>
// auto query_stats(parlay::sequence<Tvec_point<T> *> &q) {
//   parlay::sequence<size_t> vs = visited_stats(q);
//   parlay::sequence<size_t> ds = distance_stats(q);
//   auto result = {ds, vs};
//   return parlay::flatten(result);
// }

// template <typename T>
// auto range_query_stats(parlay::sequence<Tvec_point<T> *> &q) {
//   auto pred = [&](Tvec_point<T> *p) { return (p->ngh.size() == 0); };
//   auto pred1 = [&](Tvec_point<T> *p) { return !pred(p); };
//   auto zero_queries = parlay::filter(q, pred);
//   auto nonzero_queries = parlay::filter(q, pred1);
//   parlay::sequence<int> vn = visited_stats(nonzero_queries);
//   parlay::sequence<int> dn = distance_stats(nonzero_queries);
//   parlay::sequence<int> rn = rounds_stats(nonzero_queries);
//   parlay::sequence<int> vz = visited_stats(zero_queries);
//   parlay::sequence<int> dz = distance_stats(zero_queries);
//   parlay::sequence<int> rz = rounds_stats(zero_queries);
//   auto result = {rn, dn, vn, rz, dz, vz};
//   return parlay::flatten(result);
// }

// template <typename T>
// parlay::sequence<size_t> visited_stats(parlay::sequence<Tvec_point<T> *> &q) {
//   auto visited_stats =
//       parlay::tabulate(q.size(), [&](size_t i) { return q[i]->visited; });
//   parlay::sort_inplace(visited_stats);
//   size_t avg_visited = (int)parlay::reduce(visited_stats) / ((double)q.size());
//   size_t tail_index = .99 * ((float)q.size());
//   size_t tail_visited = visited_stats[tail_index];
//   auto result = {avg_visited, tail_visited};
//   return result;
// }

// template <typename T>
// parlay::sequence<size_t> distance_stats(parlay::sequence<Tvec_point<T> *> &q) {
//   auto dist_stats =
//       parlay::tabulate(q.size(), [&](size_t i) { return q[i]->dist_calls; });
//   parlay::sort_inplace(dist_stats);
//   size_t avg_dist = (size_t)parlay::reduce(dist_stats) / ((double)q.size());
//   size_t tail_index = .99 * ((float)q.size());
//   size_t tail_dist = dist_stats[tail_index];
//   auto result = {avg_dist, tail_dist};
//   return result;
// }

// template <typename T>
// parlay::sequence<size_t> rounds_stats(parlay::sequence<Tvec_point<T> *> &q) {
//   auto exp_stats =
//       parlay::tabulate(q.size(), [&](size_t i) { return q[i]->rounds; });
//   parlay::sort_inplace(exp_stats);
//   size_t avg_exps = (size_t)parlay::reduce(exp_stats) / ((double)q.size());
//   size_t tail_index = .99 * ((float)q.size());
//   size_t tail_exps = exp_stats[tail_index];
//   auto result = {avg_exps, tail_exps, exp_stats[exp_stats.size() - 1]};
//   return result;
// }

// void range_gt_stats(parlay::sequence<ivec_point> groundTruth) {
//   auto sizes = parlay::tabulate(groundTruth.size(), [&](size_t i) {
//     return groundTruth[i].coordinates.size();
//   });
//   parlay::sort_inplace(sizes);
//   size_t first_nonzero_index = 0;
//   for (size_t i = 0; i < sizes.size(); i++) {
//     if (sizes[i] != 0) {
//       first_nonzero_index = i;
//       break;
//     }
//   }
//   auto nonzero_sizes = (sizes).cut(first_nonzero_index, sizes.size());
//   auto sizes_sum = parlay::reduce(nonzero_sizes);
//   float avg =
//       static_cast<float>(sizes_sum) / static_cast<float>(nonzero_sizes.size());
//   std::cout << "Among nonzero entries, the average number of matches is " << avg
//             << std::endl;
//   std::cout << "25th percentile: " << nonzero_sizes[.25 * nonzero_sizes.size()]
//             << std::endl;
//   std::cout << "75th percentile: " << nonzero_sizes[.75 * nonzero_sizes.size()]
//             << std::endl;
//   std::cout << "99th percentile: " << nonzero_sizes[.99 * nonzero_sizes.size()]
//             << std::endl;
//   std::cout << "Max: " << nonzero_sizes[nonzero_sizes.size() - 1] << std::endl;
// }

// template <typename T>
// int connected_components(parlay::sequence<Tvec_point<T> *> &v) {
//   parlay::sequence<bool> visited(v.size(), false);
//   int cc = 0;
//   for (int i = 0; i < v.size(); i++) {
//     if (!visited[i]) {
//       BFS(i, v, visited);
//       cc++;
//     }
//   }
//   return cc;
// }

// template <typename T>
// void BFS(int start, parlay::sequence<Tvec_point<T> *> &v,
//          parlay::sequence<bool> &visited) {
//   std::queue<int> frontier;
//   frontier.push(start);
//   while (frontier.size() != 0) {
//     int c = frontier.front();
//     frontier.pop();
//     visited[c] = true;
//     for (int l = 0; l < size_of(v[c]->out_nbh); l++) {
//       int j = v[c]->out_nbh[l];
//       if (!visited[j]) frontier.push(j);
//     }
//   }
// }

} // end namespace


================================================
FILE: algorithms/utils/types.h
================================================
#ifndef ALGORITHMS_ANN_TYPES_H_
#define ALGORITHMS_ANN_TYPES_H_

// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#ifndef TYPES
#define TYPES

#include <algorithm>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <utility>

#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "mmap.h"

namespace parlayANN {

template<typename T>
struct groundTruth{
  parlay::slice<T*, T*> coords;
  parlay::slice<float*, float*> dists;
  long dim;
  size_t n;

  groundTruth() : coords(parlay::make_slice<T*, T*>(nullptr, nullptr)),
                  dists(parlay::make_slice<float*, float*>(nullptr, nullptr)){}

  groundTruth(char* gtFile) : coords(parlay::make_slice<T*, T*>(nullptr, nullptr)),
                              dists(parlay::make_slice<float*, float*>(nullptr, nullptr)){
    if(gtFile == NULL){
      n = 0;
      dim = 0;
    } else{
      auto [fileptr, length] = mmapStringFromFile(gtFile);

      int num_vectors = *((T*) fileptr);
      int d = *((T*) (fileptr + 4));

      
      std::cout << "Ground truth: detected " << num_vectors << " points with num results " << d << std::endl;

      T* start_coords = (T*)(fileptr+8);
      T* end_coords = start_coords + d*num_vectors;

      float* start_dists = (float*)(end_coords);
      float* end_dists = start_dists + d*num_vectors;

      n = num_vectors;
      dim = d;
      coords = parlay::make_slice(start_coords, end_coords);
      dists = parlay::make_slice(start_dists, end_dists);
    }
  }

  groundTruth(parlay::sequence<parlay::sequence<T>> gt) : coords(parlay::make_slice<T*, T*>(nullptr, nullptr)),
                                                          dists(parlay::make_slice<float*, float*>(nullptr, nullptr)){
    n = gt.size();
    dim = gt[0].size();
    auto flat_gt = parlay::flatten(gt);
    coords = parlay::make_slice(flat_gt.begin(), flat_gt.end());
    parlay::sequence<float> dummy_ds = parlay::sequence<float>(dim * n, 0.0);
    dists = parlay::make_slice(dummy_ds.begin(), dummy_ds.end());
  }

  //saves in binary format
  //assumes gt is not so big that it needs block saving
  void save(char* save_path) {
    std::cout << "Writing groundtruth for " << n << " points and num results " << dim
              << std::endl;
    parlay::sequence<T> preamble = {static_cast<T>(n), static_cast<T>(dim)};
    std::ofstream writer;
    writer.open(save_path, std::ios::binary | std::ios::out);
    writer.write((char*)preamble.begin(), 2 * sizeof(T));
    writer.write((char*)coords.begin(), dim*n*sizeof(T));
    writer.write((char*)dists.begin(), dim*n*sizeof(float));
    writer.close();
  }

  T coordinates(long i, long j) const {return *(coords.begin() + i * dim + j);}

  float distances(long i, long j) const {return *(dists.begin() + i * dim + j);}

  size_t size() const {return n;}

  long dimension() const {return dim;}

};

// Ground truth for range queries: query i has sizes[i] matching ids, stored
// contiguously in `coords` starting at offsets[i].
template<typename T>
struct RangeGroundTruth{
  T* coords = nullptr;          // start of the concatenated match lists
  parlay::sequence<T> offsets;  // exclusive prefix sums of sizes, plus the total
  parlay::slice<T*, T*> sizes;  // number of matches per query
  size_t n = 0;                 // number of queries
  size_t num_matches = 0;       // second header field of the file

  RangeGroundTruth() : sizes(parlay::make_slice<T*, T*>(nullptr, nullptr)){}

  // Maps a binary range ground-truth file laid out as: n, num_matches (both
  // T-sized), n per-query sizes, then the concatenated match ids.
  // A null path produces an empty ground truth.
  RangeGroundTruth(char* gtFile) : sizes(parlay::make_slice<T*, T*>(nullptr, nullptr)){
    if(gtFile == nullptr){
      n = 0;
      num_matches = 0;
    } else{
      auto [fileptr, length] = mmapStringFromFile(gtFile);

      n = *((T*) fileptr);
      num_matches = *((T*) (fileptr + sizeof(T)));

      T* sizes_begin = (T*)(fileptr + 2 * sizeof(T)) ;
      T* sizes_end = sizes_begin+n;
      sizes = parlay::make_slice(sizes_begin, sizes_end);

      // offsets[i] is where query i's matches start; the appended total makes
      // offsets[i + 1] valid for the last query as well.
      auto [offsets0, total] = parlay::scan(sizes);
      offsets0.push_back(total);
      offsets = std::move(offsets0);

      std::cout << "Detected " << n << " points with num matches " << num_matches << std::endl;

      // Match ids follow immediately after the sizes array.
      coords = sizes_end;
    }
  }

  // Returns the matches of query i as a slice into the mapped file.
  parlay::slice<T*, T*> operator[] (long i) const {
    T* begin = coords + offsets[i];
    T* end = coords + offsets[i + 1];
    return parlay::make_slice(begin, end);
  }

  size_t size() const {return n;}
  size_t matches() const {return num_matches;}
};

// Range-query mode selector stored in BuildParams and QueryParams;
// None is the default value in both.
enum rangeQueryType {None, Greedy, Doubling, Beam};

// Parameters for constructing a graph index. Every field has a default so
// that constructors which set only an algorithm-specific subset never leave
// the remaining fields indeterminate.
struct BuildParams{
  long R = 0; //vamana and pynnDescent
  long L = 0; //vamana
  double m_l = 0; // HNSW
  double alpha = 0; //vamana and pyNNDescent
  int num_passes = 0; //vamana

  long num_clusters = 0; // HCNNG and pyNNDescent
  long cluster_size = 0; //HCNNG and pyNNDescent
  long MST_deg = 0; //HCNNG

  double delta = 0; //pyNNDescent
  bool verbose = false;
  bool graph_stats = false;

  int quantize = 0; // use quantization for build and query (0 = none, 1 = one-level, 2 = two-level)
  bool self = false;
  int single_batch = 0; //vamana
  long Q = 0; //beam width to pass onto query (0 indicates none specified)
  double trim = 0.0; // for quantization
  double rerank_factor = 100; // for reranking, k * factor = to rerank
  double batch_factor = 1.0;
  bool is_early_stop = false;
  double early_stopping_radius = 0.0; // for radius search
  rangeQueryType range_query_type = None;
  double radius = 0.0; // for radius search

  std::string alg_type;


  // General constructor. alg_type is inferred from which parameter groups are
  // non-zero; it stays empty when no group matches. m_l is not a parameter
  // here, so the first branch always selects "Vamana".
  BuildParams(long R, long L, double a, int num_passes,
              long nc, long cs, long mst, double de,
              bool verbose = false, int quantize = 0,
              bool self = false, int single_batch = 0,
              long Q = 0, double trim = 0.0,
              double rerank_factor = 100, double batch_factor = 1.0,
              bool is_early_stop = false, double early_stopping_radius = 0.0,
              rangeQueryType range_query_type = None, double radius = 0.0,
              bool graph_stats = false)
    : R(R), L(L), alpha(a), num_passes(num_passes), num_clusters(nc),
      cluster_size(cs), MST_deg(mst), delta(de),
      verbose(verbose), graph_stats(graph_stats), quantize(quantize),
      self(self), single_batch(single_batch),
      Q(Q), trim(trim),
      rerank_factor(rerank_factor), batch_factor(batch_factor),
      is_early_stop(is_early_stop), early_stopping_radius(early_stopping_radius),
      range_query_type(range_query_type), radius(radius) {
    if(R != 0 && L != 0 && alpha != 0){alg_type = m_l>0? "HNSW": "Vamana";}
    else if(num_clusters != 0 && cluster_size != 0 && MST_deg != 0){alg_type = "HCNNG";}
    else if(R != 0 && alpha != 0 && num_clusters != 0 && cluster_size != 0 && delta != 0){alg_type = "pyNNDescent";}
  }

  BuildParams() {}

  // Vamana-only parameters.
  BuildParams(long R, long L, double a, int num_passes, bool verbose = false)
    : R(R), L(L), alpha(a), num_passes(num_passes), verbose(verbose), single_batch(0)
  {alg_type = "Vamana";}

  // HNSW-only parameters.
  BuildParams(long R, long L, double m_l, double a)
    : R(R), L(L), m_l(m_l), alpha(a), verbose(false)
  {alg_type = "HNSW";}

  // HCNNG-only parameters.
  BuildParams(long nc, long cs, long mst)
    : num_clusters(nc), cluster_size(cs), MST_deg(mst), verbose(false)
  {alg_type = "HCNNG";}

  // pyNNDescent-only parameters.
  BuildParams(long R, double a, long nc, long cs, double de)
    : R(R), alpha(a), num_clusters(nc), cluster_size(cs), delta(de), verbose(false)
  {alg_type = "pyNNDescent";}

  // Degree bound used for the graph: num_clusters*MST_deg for HCNNG,
  // 2*R for HNSW, and R for everything else.
  long max_degree() const {
    if(alg_type == "HCNNG") return num_clusters*MST_deg;
    else if(alg_type == "HNSW")  return R*2;
    else return R;
  }
};


// Parameters for a single search. Every field has a default so that no
// constructor leaves a field indeterminate.
struct QueryParams{
  long k = 0;
  long beamSize = 0;
  long limit = 0;
  long degree_limit = 0;
  double rerank_factor = 100;
  double batch_factor = .125;
  bool is_early_stop = false;
  double early_stopping_radius = 0.0;
  double early_stopping_count = 0.0;
  rangeQueryType range_query_type = None;
  double radius = 0.0;

  float pad = 1.0;

  // k-NN search parameters; Q is stored as beamSize and dg as degree_limit.
  QueryParams(long k, long Q, long limit, long dg,
              double rerank_factor = 100,
              double batch_factor = .125)
    : k(k), beamSize(Q), limit(limit), degree_limit(dg),
      rerank_factor(rerank_factor), batch_factor(batch_factor) {}

  // Search parameters including early stopping and range-query settings.
  // `es` is stored as a bool (non-zero means enabled).
  QueryParams(long k, long Q, long limit, long dg,
              long es, double esr, long esc,
              rangeQueryType range_query_type, double radius)
    : k(k), beamSize(Q), limit(limit), degree_limit(dg),
      is_early_stop(es), early_stopping_radius(esr), early_stopping_count(esc),
      range_query_type(range_query_type), radius(radius) {}

  QueryParams() {}

  // Writes the beam size to stdout (no trailing newline).
  void print() const {
    std::cout << "Beam: " << beamSize;
  }


};

// Adapter describing a point type to the HNSW code: it names the element and
// point types and forwards distance / id queries to the point itself.
template<typename T, typename Point>
class Desc_HNSW{
public:
  using type_elem = T;
  using type_point = Point;

  // Distance between u and v as computed by the point type; `dim` is unused.
  static auto distance(const type_point &u, const type_point &v,
                       [[maybe_unused]] uint32_t dim)
  {
    return u.distance(v);
  }

  // Identifier reported by the point.
  static auto get_id(const type_point &u)
  {
    return u.id();
  }
};

#endif
} // end namespace

#endif // ALGORITHMS_ANN_TYPES_H_


================================================
FILE: algorithms/utils/union.h
================================================
#ifndef ALGORITHMS_ANN_UNION_
#define ALGORITHMS_ANN_UNION_

#include <set>
#include "parlay/parallel.h"
#include "parlay/primitives.h"

namespace parlayANN {

// takes in two sorted sequences and returns a sorted union
// of length at most K, with a bool denoting whether P has changed
// Merges the sorted sequences P and Q (ordered by `less`) into a sorted union
// of at most K elements. The returned bool is true when at least one element
// taken from Q was placed in the result. Elements that compare equivalent
// under `less` are collapsed to one when their `.first` fields match;
// otherwise both are kept, P's element first.
template <typename F, typename T>
std::pair<parlay::sequence<T>, bool> seq_union_bounded(
    parlay::sequence<T>& P, parlay::sequence<T>& Q, int K, F&& less) {
  T* p_it = P.begin();
  T* const p_end = P.end();
  T* q_it = Q.begin();
  T* const q_end = Q.end();

  parlay::sequence<T> merged;
  merged.reserve(K);
  bool changed = false;
  int taken = 0;

  // Appends one element to the output and counts it against the bound.
  auto emit = [&](const T& item) {
    merged.push_back(item);
    ++taken;
  };

  // Merge phase: runs while both inputs still have elements and room remains.
  while (taken < K && p_it != p_end && q_it != q_end) {
    if (less(*p_it, *q_it)) {
      emit(*p_it);
      ++p_it;
    } else if (less(*q_it, *p_it)) {
      emit(*q_it);
      changed = true;
      ++q_it;
    } else if (p_it->first == q_it->first) {
      // Same key in both inputs: keep P's copy only.
      emit(*p_it);
      ++p_it;
      ++q_it;
    } else {
      // Equivalent under `less` but different keys: keep both if they fit.
      emit(*p_it);
      if (taken == K) break;
      emit(*q_it);
      changed = true;
      ++p_it;
      ++q_it;
    }
  }

  // Tail phase: copy what is left of the non-exhausted input, up to K total.
  if (p_it == p_end) {
    for (; q_it != q_end && taken < K; ++q_it) {
      changed = true;
      emit(*q_it);
    }
  } else {
    for (; p_it != p_end && taken < K; ++p_it) {
      emit(*p_it);
    }
  }
  return std::make_pair(merged, changed);
}

// takes in two sorted sequences and returns a sorted union
// Unbounded variant: merges sorted P and Q into one sorted sequence. Elements
// equivalent under `less` are collapsed to one when their `.first` fields
// match; otherwise both are kept, P's element first.
template <typename F, typename T>
parlay::sequence<T> seq_union(parlay::sequence<T>& P, parlay::sequence<T>& Q, F&& less) {
  T* p_it = P.begin();
  T* const p_end = P.end();
  T* q_it = Q.begin();
  T* const q_end = Q.end();

  parlay::sequence<T> merged;
  merged.reserve(P.size() + Q.size());

  // Merge phase: both inputs still have elements.
  while (p_it != p_end && q_it != q_end) {
    if (less(*p_it, *q_it)) {
      merged.push_back(*p_it);
      ++p_it;
    } else if (less(*q_it, *p_it)) {
      merged.push_back(*q_it);
      ++q_it;
    } else {
      merged.push_back(*p_it);
      // Different keys with equivalent order: Q's element is kept as well.
      if (!(p_it->first == q_it->first)) merged.push_back(*q_it);
      ++p_it;
      ++q_it;
    }
  }

  // Tail phase: at most one of these loops has anything to copy.
  for (; q_it != q_end; ++q_it) merged.push_back(*q_it);
  for (; p_it != p_end; ++p_it) merged.push_back(*p_it);
  return merged;
}

} // end namespace

#endif // ALGORITHMS_ANN_UNION_


================================================
FILE: algorithms/vamana/BUILD
================================================
# Vamana algorithm.

# Header-only library exposing index.h (the knn_index builder).
cc_library(
    name = "index",
    hdrs = ["index.h"],
    deps = [
        "@parlaylib//parlay:delayed",
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay:random",
        "//algorithms/utils:graph",
        "//algorithms/utils:beamSearch",
        "//algorithms/utils:types",
        "//algorithms/utils:point_range",
    ],
)

# Unit test for :index, built from index_test.cc and linked with gtest_main.
cc_test(
    name = "index_test",
    size = "small",
    srcs = ["index_test.cc"],
    deps = [
        "@googletest//:gtest_main",
        ":index",
        "//algorithms/utils:point_range",
        "//algorithms/utils:types",
        "//algorithms/utils:mmap",
        "//algorithms/utils:graph",
        "//algorithms/utils:beamSearch",
        "//algorithms/utils:stats",
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay:random",
        "@parlaylib//parlay:delayed",
    ],
)

# Header-only library exposing neighbors.h; depends on :index and the shared
# utility targets under //algorithms/utils.
cc_library(
    name = "neighbors",
    hdrs = ["neighbors.h"],
    deps = [
        ":index",
        "@parlaylib//parlay:parallel",
        "@parlaylib//parlay:primitives",
        "@parlaylib//parlay:random",
        "//algorithms/utils:beamSearch",
        "//algorithms/utils:check_nn_recall",
        "//algorithms/utils:csvfile",
        "//algorithms/utils:parse_results",
        "//algorithms/utils:graph",
        "//algorithms/utils:jl_point",
        "//algorithms/utils:stats",
        "//algorithms/utils:types",
        "//algorithms/utils:point_range",
        "//algorithms/utils:euclidean_point",
        "//algorithms/utils:mips_point",
    ],
)

# Unit test for :neighbors, built from neighbors_test.cc.
cc_test(
    name = "neighbors_test",
    size = "small",
    srcs = ["neighbors_test.cc"],
    deps = [
        "@googletest//:gtest_main",
        ":neighbors",
    ],
)


================================================
FILE: algorithms/vamana/CMakeLists.txt
================================================
# add_neighbors(NAME DEFS)
#   Defines an executable NAME built from the shared benchmark driver
#   ../bench/neighborsTime.C, linked against parlay, compiled with the
#   preprocessor definitions in DEFS, and using this directory's neighbors.h
#   as a precompiled header.
function(add_neighbors NAME DEFS)
  add_executable(${NAME} ../bench/neighborsTime.C)
  target_link_libraries(${NAME} PRIVATE parlay)
  target_compile_definitions(${NAME} PRIVATE ${DEFS})
  target_precompile_headers(${NAME} PRIVATE neighbors.h)
  target_include_directories(${NAME} PRIVATE ${PARLAY_INCLUDE_DIR})
endfunction()

# Build the default "neighbors" executable with no extra definitions.
add_neighbors(neighbors "")


================================================
FILE: algorithms/vamana/Makefile
================================================
# Shared definitions for the ANN benchmarks.
include ../bench/parallelDefsANN

# REQUIRE and BENCH are consumed by ../bench/MakeBench (included below).
# NOTE(review): presumably REQUIRE lists header prerequisites and BENCH names
# the benchmark to build; confirm against MakeBench.
REQUIRE = ../utils/beamSearch.h index.h  ../utils/check_nn_recall.h ../utils/NSGDist.h ../utils/parse_results.h ../utils/graph.h ../utils/point_range.h ../utils/euclidian_point.h ../utils/mips_point.h ../utils/jl_point.h ../utils/hashset.h ../utils/types.h
BENCH = neighbors

include ../bench/MakeBench


================================================
FILE: algorithms/vamana/index.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <math.h>

#include <algorithm>
#include <random>
#include <set>

#include "../utils/point_range.h"
#include "../utils/graph.h"
#include "../utils/types.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/delayed.h"
#include "parlay/random.h"
#include "../utils/beamSearch.h"

namespace parlayANN {

// Graph index builder for the Vamana algorithm.
//   PointRange  - container of the points used for pruning and final distances
//   QPointRange - container of the points passed alongside PointRange to
//                 filtered_beam_search during construction
//   indexType   - integer type used for vertex ids
template<typename PointRange, typename QPointRange, typename indexType>
struct knn_index {
  using Point = typename PointRange::Point;
  using QPoint = typename QPointRange::Point;
  using distanceType = typename Point::distanceType;
  // (vertex id, distance) pair used for candidate lists.
  using pid = std::pair<indexType, distanceType>;
  using PR = PointRange;
  using QPR = QPointRange;
  using GraphI = Graph<indexType>;

  // Build parameters, copied at construction time.
  BuildParams BP;
  std::set<indexType> delete_set;
  // Vertex from which construction-time searches start; assigned by set_start().
  indexType start_point;

  knn_index(BuildParams &BP) : BP(BP) {}

  // Returns the current start vertex.
  indexType get_start() { return start_point; }

  // robustPrune as described in the DiskANN paper, except that the selected
  // neighbors are returned to the caller rather than written into G[p].
  //   p      - vertex whose out-neighborhood is being chosen
  //   cand   - candidate (id, distance-to-p) pairs
  //   alpha  - pruning slack: a candidate c is dropped once a selected
  //            neighbor s satisfies alpha * d(s, c) <= d(p, c)
  //   add    - when true, p's current out-neighbors join the candidate set
  // Returns the chosen neighbors (at most BP.R) and the number of distance
  // computations performed here.
  std::pair<parlay::sequence<indexType>, long>
  robustPrune(indexType p, parlay::sequence<pid>& cand,
              GraphI &G, PR &Points, double alpha, bool add = true) {
    const size_t out_size = G[p].size();
    long distance_comps = 0;

    // Working copy of the candidates, optionally extended with G[p].
    std::vector<pid> candidates;
    for (auto entry : cand) candidates.push_back(entry);
    if (add) {
      for (size_t k = 0; k < out_size; ++k) {
        ++distance_comps;
        candidates.push_back(std::make_pair(G[p][k], Points[G[p][k]].distance(Points[p])));
      }
    }

    // Order by distance to p, breaking ties by id, so that repeated ids end
    // up adjacent and can be removed with std::unique.
    auto less = [&](std::pair<indexType, distanceType> a, std::pair<indexType, distanceType> b) {
      if (a.second < b.second) return true;
      return a.second == b.second && a.first < b.first;
    };
    std::sort(candidates.begin(), candidates.end(), less);
    candidates.erase(
        std::unique(candidates.begin(), candidates.end(),
                    [&] (auto x, auto y) {return x.first == y.first;}),
        candidates.end());

    std::vector<indexType> new_nbhs;
    new_nbhs.reserve(BP.R);

    // Walk the candidates from nearest to farthest. Ids are handled as int
    // here, and -1 marks an entry that has already been pruned.
    for (size_t next = 0;
         next < candidates.size() && new_nbhs.size() < BP.R;
         ++next) {
      int p_star = candidates[next].first;
      if (p_star == p || p_star == -1) continue;

      new_nbhs.push_back(p_star);

      // Discard every later candidate that p_star covers.
      for (size_t i = next + 1; i < candidates.size(); ++i) {
        int p_prime = candidates[i].first;
        if (p_prime == -1) continue;
        ++distance_comps;
        distanceType dist_starprime = Points[p_star].distance(Points[p_prime]);
        distanceType dist_pprime = candidates[i].second;
        if (alpha * dist_starprime <= dist_pprime) {
          candidates[i].first = -1;
        }
      }
    }

    return std::pair(parlay::to_sequence(new_nbhs), distance_comps);
  }

  // Convenience overload for candidates given as bare ids: computes each
  // candidate's distance to p, then defers to the (id, distance) overload.
  // The returned count includes the distance computations done here.
  std::pair<parlay::sequence<indexType>, long>
  robustPrune(indexType p, parlay::sequence<indexType> candidates,
              GraphI &G, PR &Points, double alpha, bool add = true){

    parlay::sequence<pid> with_dist;
    with_dist.reserve(candidates.size());
    long distance_comps = 0;
    for (indexType c : candidates) {
      with_dist.push_back(std::make_pair(c, Points[c].distance(Points[p])));
      ++distance_comps;
    }

    auto pruned = robustPrune(p, with_dist, G, Points, alpha, add);
    pruned.second += distance_comps;
    return pruned;
  }

  // add ngh to candidates without adding any repeats
  template<typename rangeType1, typename rangeType2>
  void add_neighbors_without_repeats(const rangeType1 &ngh, rangeType2& candidates) {
    std::unordered_set<indexType> a;
    for (auto c : candidates) a.insert(c);
    for (int i=0; i < ngh.size(); i++)
      if (a.count(ngh[i]) == 0) candidates.push_back(ngh[i]);
  }

  // Uses vertex 0 as the start point for construction-time searches.
  void set_start(){start_point = 0;}

  // Builds the graph G over all points.
  //   G, Points, QPoints - graph and point ranges handed on to batch_insert
  //   BuildStats         - per-point counters updated during the build
  //   sort_neighbors     - when true, each adjacency list is finally sorted
  //                        by distance from its owner
  // Runs BP.num_passes insertion passes over every point; only the last pass
  // uses BP.alpha, earlier passes use alpha = 1.0.
  void build_index(GraphI &G, PR &Points, QPR &QPoints,
                   stats<indexType> &BuildStats, bool sort_neighbors = true){
    std::cout << "Building graph..." << std::endl;
    set_start();
    parlay::sequence<indexType> inserts = parlay::tabulate(Points.size(), [&] (size_t i){
      return static_cast<indexType>(i);});
    if (BP.single_batch != 0) {
      // Single-batch mode: seed every vertex with `degree` random out-edges.
      int degree = BP.single_batch;
      std::cout << "Using single batch per round with " << degree << " random start edges" << std::endl;
      parlay::random_generator gen;
      // uniform_int_distribution is inclusive on both ends, so the upper bound
      // must be the last valid vertex id (G.size() - 1), not G.size().
      const long max_id = std::max<long>(static_cast<long>(G.size()) - 1, 0);
      std::uniform_int_distribution<long> dis(0, max_id);
      parlay::parallel_for(0, G.size(), [&] (long i) {
        std::vector<indexType> outEdges(degree);
        for (int j = 0; j < degree; j++) {
          auto r = gen[i*degree + j];
          outEdges[j] = dis(r);
        }
        G[i].update_neighbors(outEdges);
      });
    }

    // last pass uses alpha
    std::cout << "number of passes = " << BP.num_passes << std::endl;
    for (int i=0; i < BP.num_passes; i++) {
      if (i == BP.num_passes - 1)
        batch_insert(inserts, G, Points, QPoints, BuildStats, BP.alpha, true, 2, .02);
      else
        batch_insert(inserts, G, Points, QPoints, BuildStats, 1.0, true, 2, .02);
    }

    if (sort_neighbors) {
      parlay::parallel_for (0, G.size(), [&] (long i) {
        auto less = [&] (indexType j, indexType k) {
          return Points[i].distance(Points[j]) < Points[i].distance(Points[k]);};
        G[i].sort(less);});
    }
  }

  // Inserts the vertices listed in `inserts` into G, one batch at a time.
  //   inserts       - ids to insert; each must lie in [0, G.size())
  //   alpha         - pruning parameter forwarded to robustPrune
  //   random_order  - when true, the ids are processed in a random permutation
  //   base          - growth factor of the batch boundaries
  //   max_fraction  - cap on the batch size as a fraction of G.size()
  //                   (also capped at 1,000,000)
  //   print         - when true, progress is reported in roughly 10% steps
  // Batch boundaries are base^inc - 1 while base^inc <= max_batch_size, and
  // advance by max_batch_size afterwards. When BP.single_batch is non-zero
  // all ids are handled in one batch.
  // Each batch runs three phases: (1) beam search + robustPrune for every new
  // vertex, (2) collection of the reverse edges, (3) merging the reverse edges
  // into their targets, pruning where the degree bound BP.R would be exceeded.
  // Aborts the process if an id is out of range.
  void batch_insert(parlay::sequence<indexType> &inserts,
                    GraphI &G, PR &Points, QPR &QPoints,
                    stats<indexType> &BuildStats, double alpha,
                    bool random_order = false, double base = 2,
                    double max_fraction = .02, bool print=true) {
    // Valid ids are 0 .. G.size()-1, so p == G.size() must be rejected too.
    for(int p : inserts){
      if(p < 0 || p >= (int) G.size()){
        std::cout << "ERROR: invalid point "
                  << p << " given to batch_insert" << std::endl;
        abort();
      }
    }
    size_t n = G.size();
    size_t m = inserts.size();
    size_t inc = 0;
    size_t count = 0;
    float frac = 0.0;
    float progress_inc = .1;
    size_t max_batch_size = std::min(static_cast<size_t>(max_fraction * static_cast<float>(n)),
                                     1000000ul);
    //fix bug where max batch size could be set to zero
    if(max_batch_size == 0) max_batch_size = n;
    parlay::sequence<int> rperm;
    if (random_order)
      rperm = parlay::random_permutation<int>(static_cast<int>(m));
    else
      rperm = parlay::tabulate(m, [&](int i) { return i; });
    auto shuffled_inserts =
      parlay::tabulate(m, [&](size_t i) { return inserts[rperm[i]]; });
    // The timers are stopped immediately and only run around their phases.
    parlay::internal::timer t_beam("beam search time");
    parlay::internal::timer t_bidirect("bidirect time");
    parlay::internal::timer t_prune("prune time");
    t_beam.stop();
    t_bidirect.stop();
    t_prune.stop();
    while (count < m) {
      // [floor, ceiling) is the slice of shuffled_inserts handled this round.
      size_t floor;
      size_t ceiling;
      if (pow(base, inc) <= max_batch_size) {
        floor = static_cast<size_t>(pow(base, inc)) - 1;
        ceiling = std::min(static_cast<size_t>(pow(base, inc + 1)) - 1, m);
        count = std::min(static_cast<size_t>(pow(base, inc + 1)) - 1, m);
      } else {
        floor = count;
        ceiling = std::min(count + static_cast<size_t>(max_batch_size), m);
        count += static_cast<size_t>(max_batch_size);
      }

      if (BP.single_batch != 0) {
        floor = 0;
        ceiling = m;
        count = m;
      }

      parlay::sequence<parlay::sequence<indexType>> new_out_(ceiling-floor);
      // search for each node starting from the start_point, then call
      // robustPrune with the visited list as its candidate set
      t_beam.start();

      parlay::parallel_for(floor, ceiling, [&](size_t i) {
        size_t index = shuffled_inserts[i];
        // In single-batch mode the search starts from vertex i instead of
        // the global start point.
        indexType sp = BP.single_batch ? i : start_point;
        parlay::sequence<indexType> starting_points = {sp};
        QueryParams QP(0, BP.L, (long) Points.size(), (long) G.max_degree(),
                       BP.rerank_factor, BP.batch_factor);
        bool use_filtering = (Points.params.num_bytes() != QPoints.params.num_bytes());
        auto r = filtered_beam_search(G,
                                      Points[index],
                                      Points,
                                      QPoints[index],
                                      QPoints,
                                      starting_points,
                                      QP,
                                      use_filtering);
        auto visited = r.first.second;
        BuildStats.increment_dist(index, r.second);
        BuildStats.increment_visited(index, visited.size());

        long rp_distance_comps;
        std::tie(new_out_[i-floor], rp_distance_comps) = robustPrune(index, visited, G, Points, alpha);
        BuildStats.increment_dist(index, rp_distance_comps);
      });

      // Neighbor lists are written only after every search in the batch is done.
      parlay::parallel_for(floor, ceiling, [&](size_t i) {
        G[shuffled_inserts[i]].update_neighbors(new_out_[i-floor]);
      });

      t_beam.stop();

      // make each edge bidirectional by first adding each new edge
      //(i,j) to a sequence, then semisorting the sequence by key values
      t_bidirect.start();

      auto flattened = parlay::delayed::flatten(parlay::tabulate(ceiling - floor, [&](size_t i) {
        indexType index = shuffled_inserts[i + floor];
        return parlay::delayed::map(new_out_[i], [=] (indexType ngh) {
          return std::pair(ngh, index);});}));
      auto grouped_by = parlay::group_by_key(parlay::delayed::to_sequence(flattened));

      t_bidirect.stop();
      t_prune.start();
      // finally, add the bidirectional edges; if they do not make
      // the vertex exceed the degree bound, just add them to out_nbhs;
      // otherwise, use robustPrune on the vertex with user-specified alpha
      parlay::parallel_for(0, grouped_by.size(), [&](size_t j) {
        auto &[index, candidates] = grouped_by[j];
        size_t newsize = candidates.size() + G[index].size();
        if (newsize <= BP.R) {
          add_neighbors_without_repeats(G[index], candidates);
          G[index].update_neighbors(candidates);
        } else {
          auto [new_out_2_, distance_comps] = robustPrune(index, std::move(candidates), G, Points, alpha);
          G[index].update_neighbors(new_out_2_);
          BuildStats.increment_dist(index, distance_comps);
        }
      });
      t_prune.stop();

      if (print && BP.single_batch == 0) {
        // Report once for each progress threshold that falls inside this batch.
        auto ind = frac * n;
        if (floor <= ind && ceiling > ind) {
          frac += progress_inc;
          std::cout << "Pass " << 100 * frac << "% complete"
                    << std::endl;
        }
      }
      inc += 1;
    }
    t_beam.total();
    t_bidirect.total();
    t_prune.total();
  }

};

} // end namespace


================================================
FILE: algorithms/vamana/index_test.cc
================================================
#include "algorithms/vamana/index.h"

#include <gtest/gtest.h>

// Placeholder test case: it checks only a fixed arithmetic identity and does
// not call into the index code.
TEST(PlaceHolderTest, BuildPlaceHolder) {
  EXPECT_EQ(7 * 6, 42);
}


================================================
FILE: algorithms/vamana/neighbors.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <algorithm>

#include "../utils/beamSearch.h"
#include "../utils/check_nn_recall.h"
#include "../utils/parse_results.h"
#include "../utils/mips_point.h"
#include "../utils/euclidian_point.h"
#include "../utils/jl_point.h"
#include "../utils/stats.h"
#include "../utils/types.h"
#include "../utils/graph.h"
#include "index.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"

namespace parlayANN {

// Builds a Vamana graph (unless one is already supplied), prints summary
// statistics about it, and, when query points are given, hands everything to
// search_and_parse.
//
// Three representations of the data are passed side by side: the original
// points, a "Q" representation and a "QQ" representation. The index is built
// from the Q and QQ base points; all three are forwarded to search_and_parse.
//
// Parameters:
//   G            graph to build into, or an already built graph when
//                graph_built is true.
//   k            forwarded to search_and_parse.
//   BP           build parameters. Read here: verbose, graph_stats, R, L, Q,
//                rerank_factor, batch_factor; BP is also given to knn_index.
//   Query_Points, Q_Query_Points, QQ_Query_Points
//                the queries in each representation. If Query_Points is
//                empty no search is run.
//   GT, res_file forwarded to search_and_parse.
//   graph_built  when true the build step is skipped and the reported build
//                time is 0.
//   Points, Q_Points, QQ_Points
//                the base points in each representation.
template<typename PointRange, typename QPointRange, typename QQPointRange, typename indexType>
void ANN_Quantized(Graph<indexType> &G, long k, BuildParams &BP,
                   PointRange &Query_Points, QPointRange &Q_Query_Points, QQPointRange &QQ_Query_Points,
                   groundTruth<indexType> GT, char *res_file,
                   bool graph_built,
                   PointRange &Points, QPointRange &Q_Points, QQPointRange &QQ_Points) {
  parlay::internal::timer t("ANN");

  bool verbose = BP.verbose;
  using findex = knn_index<QPointRange, QQPointRange, indexType>;
  findex I(BP);
  stats<unsigned int> BuildStats(G.size());

  // Vertex 0 is always used as the search entry point, whether or not the
  // graph was built here (I.get_start() is not consulted).
  indexType start_point = 0;

  // Time spent building; stays 0 when the graph was supplied by the caller.
  double idx_time = 0;
  if (!graph_built) {
    I.build_index(G, Q_Points, QQ_Points, BuildStats);
    idx_time = t.next_time();
  }

  if (BP.graph_stats)
    print_graph_statistics(G, start_point);

  std::string name = "Vamana";
  std::string params =
    "R = " + std::to_string(BP.R) + ", L = " + std::to_string(BP.L);
  auto [avg_deg, max_deg] = graph_stats_(G);
  auto vv = BuildStats.visited_stats();
  std::cout << "Average visited: " << vv[0] << ", Tail visited: " << vv[1]
            << std::endl;
  Graph_ G_(name, params, G.size(), avg_deg, max_deg, idx_time);
  G_.print();

  // Nothing to search for without queries.
  if (Query_Points.size() == 0) return;

  search_and_parse(G_, G,
                   Points, Query_Points,
                   Q_Points, Q_Query_Points,
                   QQ_Points, QQ_Query_Points,
                   GT,
                   res_file, k, false, start_point,
                   verbose, BP.Q, BP.rerank_factor, BP.batch_factor);
}

// Entry point for the Vamana benchmark: picks the point representations
// according to BP.quantize and forwards to ANN_Quantized.
//
//   BP.quantize == 0   no quantization; the original points are used for all
//                      three roles.
//   Point::is_metric   base and queries are converted to
//                      Euclidian_Point<uint8_t>.
//                        1: that representation is used for both quantized roles
//                        3: Euclidean_JL_Sparse_Point<1024> is used for the
//                           second quantized role
//   otherwise          base and queries are converted to
//                      Quantized_Mips_Point<8,true,255>.
//                        1: that representation is used for both quantized roles
//                        5: Mips_JL_Sparse_Point<1024> is used for the second
//                           quantized role
//
// Any other non-zero BP.quantize value is reported on stdout and the function
// returns without building or searching (previously such values were
// silently ignored after the quantization work had already been done).
//
// Query ranges are always constructed with the `params` of the matching base
// range. All remaining arguments are forwarded unchanged to ANN_Quantized.
template<typename Point, typename PointRange_, typename indexType>
void ANN(Graph<indexType> &G, long k, BuildParams &BP,
         PointRange_ &Query_Points,
         groundTruth<indexType> GT, char *res_file,
         bool graph_built, PointRange_ &Points) {
  if (BP.quantize == 0) {
    ANN_Quantized(G, k, BP, Query_Points, Query_Points, Query_Points,
                  GT, res_file, graph_built, Points, Points, Points);
    return;
  }

  auto report_unsupported = [&] (const char *family) {
    std::cout << "ERROR: quantize mode " << BP.quantize
              << " is not supported for " << family
              << " points; no index was built or searched" << std::endl;
  };

  if constexpr (Point::is_metric) {
    // Disabled mode, kept for reference (same call pattern as mode 3):
    //   2: QQPoint = Euclidean_Bit_Point
    if (BP.quantize != 1 && BP.quantize != 3) {
      report_unsupported("metric");
      return;
    }
    std::cout << "quantizing build and first pass of search to 1 byte" << std::endl;

    using QT = uint8_t;
    using QPoint = Euclidian_Point<QT>;
    using QPR = PointRange<QPoint>;
    QPR Q_Points(Points);  // quantized to one byte
    QPR Q_Query_Points(Query_Points, Q_Points.params);

    if (BP.quantize == 1) {
      ANN_Quantized(G, k, BP, Query_Points, Q_Query_Points, Q_Query_Points,
                    GT, res_file, graph_built, Points, Q_Points, Q_Points);
    } else {  // BP.quantize == 3
      using QQPoint = Euclidean_JL_Sparse_Point<1024>;
      using QQPR = PointRange<QQPoint>;
      QQPR QQ_Points(Points);
      QQPR QQ_Query_Points(Query_Points, QQ_Points.params);
      ANN_Quantized(G, k, BP, Query_Points, Q_Query_Points, QQ_Query_Points,
                    GT, res_file, graph_built, Points, Q_Points, QQ_Points);
    }
  } else {
    // Disabled modes, kept for reference (same call pattern as mode 5):
    //   2: QQPoint = Mips_Bit_Point
    //   3: QQPoint = Mips_2Bit_Point
    //   4: QQPoint = Mips_JL_Sparse_Point<512>  (or Mips_JL_Bit_Point<512>)
    //   6: QQPoint = Mips_JL_Sparse_Point_Normalized<1024>
    if (BP.quantize != 1 && BP.quantize != 5) {
      report_unsupported("non-metric (mips)");
      return;
    }
    std::cout << "quantizing build and first pass of search to 1 byte" << std::endl;

    // Alternative that was tried here: Euclidian_Point<uint8_t>.
    using QPoint = Quantized_Mips_Point<8,true,255>;
    using QPR = PointRange<QPoint>;
    QPR Q_Points(Points);
    QPR Q_Query_Points(Query_Points, Q_Points.params);

    if (BP.quantize == 1) {
      ANN_Quantized(G, k, BP, Query_Points, Q_Query_Points, Q_Query_Points,
                    GT, res_file, graph_built, Points, Q_Points, Q_Points);
    } else {  // BP.quantize == 5
      using QQPoint = Mips_JL_Sparse_Point<1024>;
      using QQPR = PointRange<QQPoint>;
      QQPR QQ_Points(Points);
      QQPR QQ_Query_Points(Query_Points, QQ_Points.params);
      ANN_Quantized(G, k, BP, Query_Points, Q_Query_Points, QQ_Query_Points,
                    GT, res_file, graph_built, Points, Q_Points, QQ_Points);
    }
  }
}

} // end namespace


================================================
FILE: algorithms/vamana/neighbors.sh
================================================
#!/bin/bash
# Rebuilds the vamana benchmark and runs it on the GIST data set.
# The commented-out commands below are earlier runs on other data sets, kept
# for reference.

# Stop if the source directory is missing or the build fails, so that a stale
# (or absent) ./neighbors binary is never run from the wrong directory.
cd ~/ParlayANN/algorithms/vamana || exit 1
make || exit 1

P=/ssd1/data/bigann
# ./neighbors -R 64 -L 128 -alpha 1.2 -data_type uint8 -dist_func Euclidian -base_path $P/base.1B.u8bin.crop_nb_1000000

# # PARLAY_NUM_THREADS=1 
# ./neighbors -R 64 -L 128 -alpha 1.2 -two_pass 0 -data_type uint8 -dist_func Euclidian -query_path $P/query.public.10K.u8bin -gt_path $P/bigann-1M -res_path test.csv -base_path $P/base.1B.u8bin.crop_nb_1000000
# # ./neighbors -R 64 -L 128 -alpha 1.2 -data_type uint8 -dist_func Euclidian -graph_path $P/graph-10M -query_path $P/query.public.10K.u8bin -gt_path $P/bigann-10M -res_path test.csv -base_path $P/base.1B.u8bin.crop_nb_10000000

# Q=/ssd1/data/text2image1B
# ./neighbors -R 64 -L 128 -alpha 1.0 -data_type float -two_pass 0 -dist_func mips -query_path $Q/query.public.10K.fbin -gt_path $Q/text2image-10K-1M -res_path test.csv -base_path $Q/base.1B.fbin.crop_nb_1000000

# V=/ssd1/data/MSSPACEV1B
# ./neighbors -R 64 -L 128 -alpha 1.2 -two_pass 0 -data_type int8 -dist_func Euclidian -query_path $V/query.i8bin -gt_path $V/msspacev-1M -res_path test.csv -base_path $V/spacev1b_base.i8bin.crop_nb_1000000


# ./neighbors -R 64 -L 128 -a 1.2 -data_type uint8 -dist_func Euclidian -query_path /ssd1/data/bigann/query.public.10K.u8bin -gt_path /ssd1/data/bigann/bigann-1M -res_path test.csv -base_path /ssd1/data/bigann/base.1B.u8bin.crop_nb_1000000

# P=/ssd1/data/FB_ssnpp
# make
# ./neighbors -R 128 -L 256 -alpha 1.0 -data_type uint8 -two_pass 0 -dist_func Euclidian -k 10 -query_path $P/ssnpp_nonzero.u8bin -gt_path $P/ssnpp-nonzero-1M -res_path test.csv -base_path $P/FB_ssnpp_database.u8bin.crop_nb_1000000

T=/ssd1/data/gist
./neighbors -R 64 -L 128 -alpha 1.2 -two_pass 0 -data_type float -dist_func Euclidian -query_path "$T/gist_query.fbin" -gt_path "$T/gist-1M" -res_path test.csv -base_path "$T/gist_base.fbin"


================================================
FILE: algorithms/vamana/neighbors_test.cc
================================================
#include "algorithms/vamana/neighbors.h"

#include <gtest/gtest.h>

// Placeholder test case: it checks only a fixed arithmetic identity and does
// not call into the neighbors code.
TEST(PlaceHolderTest, BuildPlaceHolder) {
  EXPECT_EQ(7 * 6, 42);
}


================================================
FILE: algorithms/vamana/scripts/OpenAIArXiv
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# OpenAIArXiv data set.
NAME=OpenAIArXiv
BUILD_ARGS="-R 100 -L 200 -alpha 1.05 -num_passes 2 -quantize_mode 3 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 3 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func Euclidian -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.bin
QUERY_FILE=$DATA_DIR/query.bin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_100_1.05_m3

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/deep10M
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# deep10M data set.
NAME=deep10M
BUILD_ARGS="-R 64 -L 128 -alpha 1.05 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 1 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func Euclidian -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_64_1.05

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/fashion
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# fashion-mnist-784-euclidean data set.
NAME=fashion-mnist-784-euclidean
BUILD_ARGS="-R 40 -L 80 -alpha 1.1 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 8 -verbose"
TYPE_ARGS="-data_type float -dist_func Euclidian -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/"graph_40_1.1"

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/gist
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# gist data set.
NAME=gist
BUILD_ARGS="-R 100 -L 200 -alpha 1.1 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 3 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func Euclidian -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_100_1.1

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/glove100
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# glove-100-angular data set.
NAME=glove-100-angular
BUILD_ARGS="-R 100 -L 200 -alpha 1 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 1 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func mips -normalize -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_100_1

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/glove25
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# glove-25-angular data set.
NAME=glove-25-angular
BUILD_ARGS="-R 100 -L 200 -alpha 1 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 1 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func mips -normalize -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_100_1

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/msmarco_websearch
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# msmarco_websearch data set.
BUILD_ARGS="-R 64 -L 128 -alpha 1 -num_passes 1 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 5 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func mips -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/msmarco_websearch
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.bin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_64_1

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/nytimes
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# nytimes-256-angular data set.
BUILD_ARGS="-R 130 -L 260 -alpha .85 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 5 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func mips -normalize -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/nytimes-256-angular
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_130_85

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/sift
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# sift-128-euclidean data set.
BUILD_ARGS="-R 64 -L 128 -alpha 1.15 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 8 -verbose"
TYPE_ARGS="-data_type float -dist_func Euclidian -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/sift-128-euclidean
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/"graph_64_1.15"

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/sift100
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# sift100 data set.
BUILD_ARGS="-R 64 -L 128 -alpha 1.15 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 8 -verbose"
TYPE_ARGS="-data_type uint8 -dist_func Euclidian -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/sift100
DATA_FILE=$DATA_DIR/base.uint8
QUERY_FILE=$DATA_DIR/query.uint8
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/"graph_64_1.15"

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/space_1
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# space_1 data set.
NAME=space_1
BUILD_ARGS="-R 64 -L 128 -alpha 1.1 -num_passes 2 -verbose"
QUERY_ARGS="-verbose"
TYPE_ARGS="-data_type int8 -dist_func Euclidian -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.i8bin
QUERY_FILE=$DATA_DIR/query.i8bin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_64_1.1

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/space_10
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# space_10 data set.
NAME=space_10
BUILD_ARGS="-R 64 -L 128 -alpha 1.1 -num_passes 2 -verbose"
QUERY_ARGS="-verbose"
TYPE_ARGS="-data_type int8 -dist_func Euclidian -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.i8bin
QUERY_FILE=$DATA_DIR/query.i8bin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_64_1.1

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/t2i_1
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# t2i_1 data set.
NAME=t2i_1
BUILD_ARGS="-R 100 -L 200 -alpha 1 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 1 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func mips -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_100_1

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/t2i_10
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# t2i_10 data set.
NAME=t2i_10
BUILD_ARGS="-R 100 -L 200 -alpha 1 -num_passes 2 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 1 -verbose -rerank_factor 2"
TYPE_ARGS="-data_type float -dist_func mips -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/$NAME
DATA_FILE=$DATA_DIR/base.fbin
QUERY_FILE=$DATA_DIR/query.fbin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_100_1

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamana/scripts/wikipedia_cohere
================================================
# bash
# Prints (does not run) the ./neighbors build and query command lines for the
# wikipedia_cohere data set.
BUILD_ARGS="-R 64 -L 128 -alpha .98 -num_passes 1 -quantize_bits 8 -verbose"
QUERY_ARGS="-quantize_bits 16 -quantize_mode 5 -verbose -rerank_factor 6"
TYPE_ARGS="-data_type float -dist_func mips -file_type bin"

# Deliberately not called PATH: assigning PATH would replace the shell's
# command search path.
DATA_DIR=data/wikipedia_cohere
DATA_FILE=$DATA_DIR/base.bin
QUERY_FILE=$DATA_DIR/query.bin
GROUNDTRUTH_FILE=$DATA_DIR/groundtruth
GRAPH_FILE=$DATA_DIR/graphs/graph_64_98

# build
echo ./neighbors $BUILD_ARGS $TYPE_ARGS -base_path $DATA_FILE -graph_outfile $GRAPH_FILE

# query
echo ./neighbors $QUERY_ARGS $TYPE_ARGS -base_path $DATA_FILE -query_path $QUERY_FILE -gt_path $GROUNDTRUTH_FILE -graph_path $GRAPH_FILE


================================================
FILE: algorithms/vamanaRange/CMakeLists.txt
================================================
# TODO: fix build
#add_executable(neighbors-vamanaRange ../bench/neighborsTime.C)
#  target_link_libraries(neighbors-vamanaRange PRIVATE parlay)
#  target_precompile_headers(neighbors-vamanaRange PRIVATE neighbors.h)


================================================
FILE: algorithms/vamanaRange/Makefile
================================================
# Build configuration for the vamanaRange benchmark. The build rules themselves
# come from the two makefiles included from ../bench.
include ../bench/parallelDefsANN

# Headers this benchmark names for the build, and the benchmark's name.
# NOTE(review): presumably ../bench/MakeBench treats REQUIRE as prerequisites
# and BENCH as the target to produce -- confirm against that file.
REQUIRE = ../utils/beamSearch.h index.h  ../utils/check_nn_recall.h ../utils/NSGDist.h ../utils/parse_results.h ../utils/graph.h ../utils/point_range.h
BENCH = neighbors

include ../bench/MakeBench


================================================
FILE: algorithms/vamanaRange/index.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <math.h>

#include <algorithm>
#include <random>
#include <set>

#include "../utils/NSGDist.h"
#include "../utils/point_range.h"
#include "../utils/graph.h"
#include "../utils/types.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/delayed.h"
#include "parlay/random.h"
#include "../utils/beamSearch.h"


// Incremental builder for a degree-bounded proximity graph (the robustPrune
// comments below refer to the DiskANN paper).
//
// Template parameters:
//   Point       point type; must provide `distanceType` and is passed on to
//               beam_search.
//   PointRange  container indexed by vertex id whose elements offer
//               `distance(other)`.
//   indexType   integer type used for vertex ids.
//
// The value static_cast<indexType>(-1) is used internally by robustPrune as a
// "pruned" marker and therefore must not be a real vertex id.
template<typename Point, typename PointRange, typename indexType>
struct knn_index {
  using distanceType = typename Point::distanceType;
  using pid = std::pair<indexType, distanceType>;
  using PR = PointRange;
  using GraphI = Graph<indexType>;

  BuildParams BP;
  std::set<indexType> delete_set;
  // Entry vertex for the beam searches. Defaults to 0 (the value set_start()
  // assigns) so that batch_insert is well defined even when it is called
  // without going through build_index first.
  indexType start_point = 0;

  knn_index(BuildParams &BP) : BP(BP) {}

  indexType get_start() { return start_point; }

  // robustPrune routine as found in the DiskANN paper, with the exception
  // that the selected neighbors are returned instead of directly replacing
  // the out-neighborhood of p.
  //
  //   p       vertex whose neighbor list is being chosen
  //   cand    candidate (id, distance) pairs; the distances are used as the
  //           distance from each candidate to p
  //   alpha   pruning factor: once p* has been selected, a remaining
  //           candidate p' is discarded when
  //           alpha * d(p*, p') <= d(p, p')
  //   add     when true, the current out-neighbors of p are added to the
  //           candidate set first
  //
  // Returns at most BP.R selected neighbor ids (never p itself) together with
  // the number of distance computations performed here.
  std::pair<parlay::sequence<indexType>, long>
  robustPrune(indexType p, parlay::sequence<pid>& cand,
              GraphI &G, PR &Points, double alpha, bool add = true) {
    const size_t out_size = G[p].size();
    long distance_comps = 0;

    std::vector<pid> candidates;
    candidates.reserve(cand.size() + (add ? out_size : 0));
    for (auto x : cand) candidates.push_back(x);

    // add out neighbors of p to the candidate set.
    if (add) {
      for (size_t i = 0; i < out_size; i++) {
        distance_comps++;
        candidates.push_back(std::make_pair(G[p][i], Points[G[p][i]].distance(Points[p])));
      }
    }

    // Sort the candidate set by distance from p, breaking ties by id. The
    // tie-break places equal (id, distance) entries next to each other, which
    // std::unique below relies on: sorting by distance alone can leave two
    // copies of one id separated by a different id at the same distance.
    auto less = [](const pid &a, const pid &b) {
      return a.second < b.second || (a.second == b.second && a.first < b.first);
    };
    std::sort(candidates.begin(), candidates.end(), less);

    // remove any duplicates
    auto new_end = std::unique(candidates.begin(), candidates.end(),
                               [](const pid &x, const pid &y) { return x.first == y.first; });
    candidates.erase(new_end, candidates.end());

    // Marker written over the id of a candidate that has been pruned. Kept in
    // indexType so ids are never narrowed to int.
    const indexType pruned = static_cast<indexType>(-1);

    std::vector<indexType> new_nbhs;
    new_nbhs.reserve(BP.R);

    size_t candidate_idx = 0;
    while (new_nbhs.size() < BP.R && candidate_idx < candidates.size()) {
      const indexType p_star = candidates[candidate_idx].first;
      candidate_idx++;
      // Skip p itself and anything an earlier selection already pruned.
      if (p_star == p || p_star == pruned) {
        continue;
      }

      new_nbhs.push_back(p_star);

      // Prune the remaining candidates that p_star covers.
      for (size_t i = candidate_idx; i < candidates.size(); i++) {
        const indexType p_prime = candidates[i].first;
        if (p_prime != pruned) {
          distance_comps++;
          distanceType dist_starprime = Points[p_star].distance(Points[p_prime]);
          distanceType dist_pprime = candidates[i].second;
          if (alpha * dist_starprime <= dist_pprime) {
            candidates[i].first = pruned;
          }
        }
      }
    }

    auto new_neighbors_seq = parlay::to_sequence(new_nbhs);
    return std::pair(new_neighbors_seq, distance_comps);
  }

  // wrapper to allow calling robustPrune on a sequence of candidates
  // that do not come with precomputed distances: the distance from each
  // candidate to p is computed here and counted in the returned total.
  std::pair<parlay::sequence<indexType>, long>
  robustPrune(indexType p, parlay::sequence<indexType> candidates,
              GraphI &G, PR &Points, double alpha, bool add = true){

    parlay::sequence<pid> cc;
    long distance_comps = 0;
    cc.reserve(candidates.size());
    for (size_t i = 0; i < candidates.size(); ++i) {
      distance_comps++;
      cc.push_back(std::make_pair(candidates[i], Points[candidates[i]].distance(Points[p])));
    }
    auto [ngh_seq, dc] = robustPrune(p, cc, G, Points, alpha, add);
    return std::pair(ngh_seq, dc + distance_comps);
  }

  // Appends to `candidates` every element of `ngh` that is not already in it.
  // An id that occurs several times in `ngh` is appended only once.
  template<typename rangeType1, typename rangeType2>
  void add_neighbors_without_repeats(const rangeType1 &ngh, rangeType2& candidates) {
    std::unordered_set<indexType> seen;
    for (auto c : candidates) seen.insert(c);
    for (size_t i = 0; i < ngh.size(); i++)
      if (seen.insert(ngh[i]).second) candidates.push_back(ngh[i]);
  }

  void set_start(){start_point = 0;}

  // Builds the graph over all of Points.
  //
  // Every point is inserted with batch_insert in random order; when
  // BP.two_pass is set, a first pass with alpha = 1.0 precedes the pass with
  // BP.alpha. If sort_neighbors is true, each vertex's neighbor list is
  // finally sorted by distance from that vertex.
  void build_index(GraphI &G, PR &Points, stats<indexType> &BuildStats, bool sort_neighbors = true){
    std::cout << "Building graph..." << std::endl;
    set_start();
    parlay::sequence<indexType> inserts = parlay::tabulate(Points.size(), [&] (size_t i){
      return static_cast<indexType>(i);});

    if (BP.two_pass) batch_insert(inserts, G, Points, BuildStats, 1.0, true, 2, .02);
    batch_insert(inserts, G, Points, BuildStats, BP.alpha, true, 2, .02);

    if (sort_neighbors) {
      parlay::parallel_for (0, G.size(), [&] (long i) {
        auto less = [&] (indexType j, indexType k) {
          return Points[i].distance(Points[j]) < Points[i].distance(Points[k]);};
        G[i].sort(less);});
    }
  }

  // Inserts the vertices listed in `inserts` into G in batches.
  //
  //   alpha         pruning factor handed to robustPrune
  //   random_order  process `inserts` in a random permutation instead of the
  //                 given order
  //   base          batch boundaries grow as powers of `base` until a batch
  //                 would exceed the maximum batch size; after that all
  //                 batches have the maximum size
  //   max_fraction  maximum batch size as a fraction of G.size(), additionally
  //                 capped at 1000000
  //   print         print progress messages
  //
  // Each batch: (1) beam-search from start_point for every new vertex and
  // prune the visited set into its out-neighbors, (2) collect the reverse
  // edges grouped by target vertex, (3) add the reverse edges, pruning a
  // target only if it would exceed BP.R neighbors.
  //
  // Aborts the process if any id in `inserts` is not a valid vertex of G.
  void batch_insert(parlay::sequence<indexType> &inserts,
                     GraphI &G, PR &Points, stats<indexType> &BuildStats, double alpha,
                    bool random_order = false, double base = 2,
                    double max_fraction = .02, bool print=true) {
    const size_t n = G.size();
    // Valid ids are 0 .. n-1. Converting to size_t also turns a negative id
    // of a signed indexType into a huge value, so one test covers both ends.
    for (indexType p : inserts) {
      if (static_cast<size_t>(p) >= n) {
        std::cout << "ERROR: invalid point "
                  << p << " given to batch_insert" << std::endl;
        abort();
      }
    }
    const size_t m = inserts.size();
    size_t inc = 0;
    size_t count = 0;
    float frac = 0.0;
    float progress_inc = .1;
    size_t max_batch_size = std::min(
        static_cast<size_t>(max_fraction * static_cast<float>(n)),
        static_cast<size_t>(1000000));
    //fix bug where max batch size could be set to zero
    if (max_batch_size == 0) max_batch_size = n;

    parlay::sequence<int> rperm;
    if (random_order)
      rperm = parlay::random_permutation<int>(static_cast<int>(m));
    else
      rperm = parlay::tabulate(m, [&](int i) { return i; });
    auto shuffled_inserts =
        parlay::tabulate(m, [&](size_t i) { return inserts[rperm[i]]; });

    parlay::internal::timer t_beam("beam search time");
    parlay::internal::timer t_bidirect("bidirect time");
    parlay::internal::timer t_prune("prune time");
    t_beam.stop();
    t_bidirect.stop();
    t_prune.stop();

    while (count < m) {
      // The current batch is shuffled_inserts[batch_begin, batch_end).
      size_t batch_begin;
      size_t batch_end;
      const double growth_start = pow(base, inc);
      if (growth_start <= max_batch_size) {
        // Growth phase: batch number inc covers [base^inc - 1, base^(inc+1) - 1).
        batch_begin = static_cast<size_t>(growth_start) - 1;
        batch_end = std::min(static_cast<size_t>(pow(base, inc + 1)) - 1, m);
        count = batch_end;
      } else {
        // Steady phase: fixed-size batches.
        batch_begin = count;
        batch_end = std::min(count + max_batch_size, m);
        count += max_batch_size;
      }

      parlay::sequence<parlay::sequence<indexType>> new_out_(batch_end - batch_begin);
      // search for each node starting from the start_point, then call
      // robustPrune with the visited list as its candidate set
      t_beam.start();
      parlay::parallel_for(batch_begin, batch_end, [&](size_t i) {
        size_t index = shuffled_inserts[i];
        QueryParams QP(0L, BP.L, 0.0, static_cast<long>(Points.size()),
                       static_cast<long>(G.max_degree()));
        auto [beam_visited, bs_distance_comps] =
          beam_search<Point, PointRange, indexType>(Points[index], G, Points, start_point, QP);
        auto [beam, visited] = beam_visited;
        BuildStats.increment_dist(index, bs_distance_comps);
        BuildStats.increment_visited(index, visited.size());

        long rp_distance_comps;
        std::tie(new_out_[i - batch_begin], rp_distance_comps) =
          robustPrune(index, visited, G, Points, alpha);
        BuildStats.increment_dist(index, rp_distance_comps);
      });
      t_beam.stop();

      // make each edge bidirectional by first adding each new edge
      //(i,j) to a sequence, then semisorting the sequence by key values
      t_bidirect.start();

      auto flattened = parlay::delayed::flatten(parlay::tabulate(batch_end - batch_begin, [&](size_t i) {
        indexType index = shuffled_inserts[i + batch_begin];
        return parlay::delayed::map(new_out_[i], [=] (indexType ngh) {
          return std::pair(ngh, index);});}));
      auto grouped_by = parlay::group_by_key_ordered(parlay::delayed::to_sequence(flattened));

      // Install the forward edges of the newly inserted vertices.
      parlay::parallel_for(batch_begin, batch_end, [&](size_t i) {
        G[shuffled_inserts[i]].update_neighbors(new_out_[i - batch_begin]);
      });

      t_bidirect.stop();
      t_prune.start();
      // finally, add the bidirectional edges; if they do not make
      // the vertex exceed the degree bound, just add them to out_nbhs;
      // otherwise, use robustPrune on the vertex with user-specified alpha
      parlay::parallel_for(0, grouped_by.size(), [&](size_t j) {
        auto &[index, candidates] = grouped_by[j];
        size_t newsize = candidates.size() + G[index].size();
        if (newsize <= BP.R) {
          add_neighbors_without_repeats(G[index], candidates);
          G[index].update_neighbors(candidates);
        } else {
          auto [new_out_2_, distance_comps] = robustPrune(index, std::move(candidates), G, Points, alpha);
          G[index].update_neighbors(new_out_2_);
          BuildStats.increment_dist(index, distance_comps);
        }
      });
      t_prune.stop();

      if (print) {
        // Report once the batch containing the next progress mark is done.
        auto ind = frac * n;
        if (batch_begin <= ind && batch_end > ind) {
          frac += progress_inc;
          std::cout << "Pass " << 100 * frac << "% complete"
                    << std::endl;
        }
      }
      inc += 1;
    }
    t_beam.total();
    t_bidirect.total();
    t_prune.total();
  }

};


================================================
FILE: algorithms/vamanaRange/neighbors.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <algorithm>

#include "../utils/NSGDist.h"
#include "../utils/beamSearch.h"
#include "../utils/check_nn_recall.h"
#include "../utils/parse_results.h"
#include "../utils/mips_point.h"
#include "../utils/euclidian_point.h"
#include "../utils/stats.h"
#include "../utils/types.h"
#include "../utils/graph.h"
#include "index.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"


// Builds a graph index over Points (unless one was already built) and then
// runs a range search from every point of Points against that graph,
// printing build and search statistics to stdout.
//
//   G            - graph that is built into (when !graph_built) and searched
//   k            - not used in this function
//   BP           - build parameters; passed to the index builder, and R, L,
//                  radius and radius_2 are read here
//   Query_Points - not used in this function
//   GT           - not used in this function
//   res_file     - not used in this function
//   graph_built  - when true the build step is skipped and the index time is
//                  reported as 0
//   Points       - the base points; they are both the indexed set and the
//                  query set of the range search below
template<typename Point, typename PointRange_, typename indexType>
void ANN(Graph<indexType> &G, long k, BuildParams &BP,
         PointRange_ &Query_Points,
         groundTruth<indexType> GT, char *res_file,
         bool graph_built, PointRange_ &Points) {

  // NOTE(review): start_point is assigned in every build branch but is not
  // read again anywhere in this function.
  indexType start_point;
  double idx_time;
  stats<unsigned int> BuildStats(G.size());
  parlay::internal::timer t("ANN");
  if(graph_built){
    idx_time = 0;
  } else {
    // When the coordinate type is at least 4 bytes wide the index is built on
    // a uint8_t point range constructed from Points (QPR pr(Points)) instead
    // of on Points itself.
    // NOTE(review): presumably this is a quantized copy - confirm against the
    // PointRange converting constructor.
    if (sizeof(typename PointRange_::T) >= 4) {
      if (Points[0].is_metric()) {
        using QT = uint8_t;
        using QPoint = Euclidian_Point<QT>;
        using QPR = PointRange<QT, QPoint>;
        QPR pr(Points);
        using findex = knn_index<QPoint, QPR, indexType>;
        findex I(BP);
        I.build_index(G, pr, BuildStats, false);
        start_point = I.get_start();
      } else {
        using QT = uint8_t;
        using QPoint = Quantized_Mips_Point<QT>;
        using QPR = PointRange<QT, QPoint>;
        QPR pr(Points);
        using findex = knn_index<QPoint, QPR, indexType>;
        findex I(BP);
        I.build_index(G, pr, BuildStats, false);
        start_point = I.get_start();
      }
    } else {
      // Narrow coordinate types are indexed directly.
      using findex = knn_index<Point, PointRange_, indexType>;
      findex I(BP);
      I.build_index(G, Points, BuildStats, false);
      start_point = I.get_start();
    }
    idx_time = t.next_time();
  }

  // Report the graph under the fixed name "Vamana" with its R and L values.
  std::string name = "Vamana";
  std::string params =
      "R = " + std::to_string(BP.R) + ", L = " + std::to_string(BP.L);
  auto [avg_deg, max_deg] = graph_stats_(G);
  auto vv = BuildStats.visited_stats();

  // Total of the per-point distance counters recorded during the build.
  long build_num_distances = parlay::reduce(parlay::map(BuildStats.distances, [] (auto x) {return (long) x;}));

  Graph_ G_(name, params, G.size(), avg_deg, max_deg, idx_time);
  G_.print();

  parlay::internal::timer t_range("range search time");
  double radius = BP.radius;
  double radius_2 = BP.radius_2;
  std::cout << "radius = " << radius << " radius_2 = " << radius_2 << std::endl;
  // Query parameters are hard-coded here apart from the two limits, which
  // are taken from the graph's size and maximum degree.
  QueryParams QP;
  QP.limit = (long) G.size();
  QP.degree_limit = (long) G.max_degree();
  QP.cut = 1.535;
  QP.k = 0;
  QP.beamSize = 45;
  long n = Points.size();
  // counts and distance_comps are sized by Points.size() but indexed by
  // [0, G.size()) below.
  // NOTE(review): assumes G.size() <= Points.size() - confirm.
  parlay::sequence<long> counts(n);
  parlay::sequence<long> distance_comps(n);
  parlay::parallel_for(0, G.size(), [&] (long i) {
    // The only id handed to range_search for point i is the point's own id.
    // NOTE(review): presumably these are the search's starting points - confirm.
    parlay::sequence<indexType> pts;
    pts.push_back(Points[i].id()); //Points[i].id());
    auto [r, dc] = range_search(Points[i], G, Points, pts, radius, radius_2, QP, true);
    counts[i] = r.size();
    distance_comps[i] = dc;});
  t_range.total();
  long range_num_distances = parlay::reduce(distance_comps);

  std::cout << "edges within range: " << parlay::reduce(counts) << std::endl;
  std::cout << "distance comparisons during build = " << build_num_distances << std::endl;
  std::cout << "distance comparisons during range = " << range_num_distances << std::endl;

  // brute force
  // Disabled (if (false)). When enabled it counts, for every i, the points
  // j < i within `radius` of point i, overwrites `counts` with those numbers
  // and prints twice their sum (each unordered pair is only visited once).
  if (false) {
    parlay::sequence<parlay::sequence<indexType>> in_radius(G.size());
    parlay::parallel_for(0, G.size(), [&] (long i) {
      if (i % 10000 == 0) std::cout << "." << std::flush;
      parlay::sequence<indexType> pts;
      long cnt = 0;
      for (long j=0; j < i; j++) 
        if (Points[i].distance(Points[j]) <= radius) {
          in_radius[i].push_back(j);
          //in_radius[j].push_back(i);
        }
                                      }, 1);
    parlay::parallel_for (0, G.size(), [&] (long i) {
                                         //std::sort(in_radius[i].begin(), in_radius[i].end());
                                         counts[i] = in_radius[i].size();
                                       });
    
    std::cout << "gt count: " << parlay::reduce(counts) * 2 << std::endl;
  }
}


================================================
FILE: build/_deps/parlaylib-subbuild/CMakeLists.txt
================================================
# Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.

cmake_minimum_required(VERSION 3.28.0)

# Reject any attempt to use a toolchain file. We must not use one because
# we could be downloading it here. If the CMAKE_TOOLCHAIN_FILE environment
# variable is set, the cache variable will have been initialized from it.
unset(CMAKE_TOOLCHAIN_FILE CACHE)
unset(ENV{CMAKE_TOOLCHAIN_FILE})

# We name the project and the target for the ExternalProject_Add() call
# to something that will highlight to the user what we are working on if
# something goes wrong and an error message is produced.

project(parlaylib-populate NONE)


# Pass through things we've already detected in the main project to avoid
# paying the cost of redetecting them again in ExternalProject_Add()
set(GIT_EXECUTABLE [==[/usr/bin/git]==])
set(GIT_VERSION_STRING [==[2.25.1]==])
set_property(GLOBAL PROPERTY _CMAKE_FindGit_GIT_EXECUTABLE_VERSION
  [==[/usr/bin/git;2.25.1]==]
)


include(ExternalProject)
# Declares the parlaylib-populate external project. The sources come from
# GIT_REPOSITORY at GIT_TAG "master" and are placed in SOURCE_DIR. The
# configure, build, install and test commands are all empty, so this step
# runs no build commands for the project.
# NOTE(review): SOURCE_DIR and BINARY_DIR are absolute paths on one
# developer's machine, and this file lives under build/ - it looks like a
# generated artifact that was committed by accident; confirm before relying
# on it.
ExternalProject_Add(parlaylib-populate
                     "UPDATE_DISCONNECTED" "False" "GIT_REPOSITORY" "https://github.com/cmuparlay/parlaylib.git" "EXTERNALPROJECT_INTERNAL_ARGUMENT_SEPARATOR" "GIT_TAG" "master"
                    SOURCE_DIR          "/usr0/home/guyb/cvs/ParlayANN/build/_deps/parlaylib-src"
                    BINARY_DIR          "/usr0/home/guyb/cvs/ParlayANN/build/_deps/parlaylib-build"
                    CONFIGURE_COMMAND   ""
                    BUILD_COMMAND       ""
                    INSTALL_COMMAND     ""
                    TEST_COMMAND        ""
                    USES_TERMINAL_DOWNLOAD  YES
                    USES_TERMINAL_UPDATE    YES
                    USES_TERMINAL_PATCH     YES
)


================================================
FILE: data_tools/Makefile
================================================
# Build rules for the data tools. Every tool is a single translation unit
# compiled and linked in one step.
# NOTE(review): CC, CFLAGS and LFLAGS are not set in this file; presumably
# they are defined by the included parallelDefsANN - confirm.
include ../algorithms/bench/parallelDefsANN

# Since vec_to_bin is the first rule, it is the default goal of a bare `make`.
vec_to_bin : vec_to_bin.cpp
	$(CC) $(CFLAGS) -o vec_to_bin vec_to_bin.cpp $(LFLAGS) 

compute_groundtruth : compute_groundtruth.cpp
	$(CC) $(CFLAGS) -o compute_groundtruth compute_groundtruth.cpp $(LFLAGS) 

compute_range_groundtruth : compute_range_groundtruth.cpp
	$(CC) $(CFLAGS) -o compute_range_groundtruth compute_range_groundtruth.cpp $(LFLAGS) 

crop : crop.cpp
	$(CC) $(CFLAGS) -o crop crop.cpp $(LFLAGS) 

random_sample : random_sample.cpp
	$(CC) $(CFLAGS) -o random_sample random_sample.cpp $(LFLAGS) 

================================================
FILE: data_tools/compute_groundtruth.cpp
================================================
/*
  Example usage:
    ./compute_groundtruth -base_path ~/data/sift/sift-1M \
    -query_path ~/data/sift/query-10K -data_type uint8 \
    -dist_func Euclidian -k 100 -gt_path ~/data/sift/GT/sift-1M.gt
*/

#include <iostream>
#include <algorithm>
#include <cstdint>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/io.h"
#include "utils/euclidian_point.h"
#include "utils/mips_point.h"
#include "utils/point_range.h"
#include "../algorithms/bench/parse_command_line.h"

using pid = std::pair<int, float>;
using namespace parlayANN;

// Brute-force nearest-neighbor ground truth.
//
// For every query Q[i] the distance to every base point B[j] is evaluated and
// the (up to) k closest base points are kept as (index, distance) pairs.
// The pairs of a row are NOT sorted; the row is maintained as an unordered
// buffer in which the current worst entry is tracked and replaced.
//
//   B - base points
//   Q - query points
//   k - number of neighbors to keep per query; values <= 0 keep none
//
// Returns one sequence per query, each holding min(k, B.size()) pairs.
template<typename PointRange>
parlay::sequence<parlay::sequence<pid>> compute_groundtruth(PointRange &B,
  PointRange &Q, int k){
    const size_t q = Q.size();
    const size_t b = B.size();
    const size_t want = k > 0 ? static_cast<size_t>(k) : 0;
    auto answers = parlay::tabulate(q, [&] (size_t i){
        parlay::sequence<pid> topk;
        // B[0] is consulted below, so an empty base set must bail out first.
        if(b == 0 || want == 0) return topk;

        // d_min() seeds the running maximum.
        // NOTE(review): presumably it is the smallest distance the point
        // type can produce - confirm.
        const float lowest = B[0].d_min();
        float topdist = lowest;   // largest distance currently stored in topk
        size_t toppos = 0;        // position of that entry inside topk
        for(size_t j=0; j<b; j++){
            const float dist = Q[i].distance(B[j]);
            if(topk.size() < want){
                // still filling the buffer: only keep the maximum up to date
                if(dist > topdist){
                    topdist = dist;
                    toppos = topk.size();
                }
                topk.push_back(std::make_pair(static_cast<int>(j), dist));
            }
            else if(dist < topdist){
                // evict the current worst entry, then rescan for the new worst
                topk[toppos] = std::make_pair(static_cast<int>(j), dist);
                topdist = lowest;
                toppos = 0;
                for(size_t l=0; l<topk.size(); l++){
                    if(topk[l].second > topdist){
                        topdist = topk[l].second;
                        toppos = l;
                    }
                }
            }
        }
        return topk;
    });
    std::cout << "Done computing groundtruth" << std::endl;
    return answers;
}

// ibin is the same as the binary groundtruth format used in the
// big-ann-benchmarks (see: https://big-ann-benchmarks.com/neurips21.html)
//
// Sorts every row of `result` by increasing distance (in place) and writes
//   [int rows][int cols][rows*cols int ids][rows*cols float distances]
// to outFile, where rows = result.size() and cols = min(k, shortest row).
// Clamping cols keeps the header consistent with the payload and avoids
// reading past the end of a row when fewer than k neighbors were found
// (e.g. the base set has fewer than k points). An empty `result` yields a
// file that only holds the header {0, 0}.
//
// Aborts if outFile cannot be opened for writing.
void write_ibin(parlay::sequence<parlay::sequence<pid>> &result, const std::string outFile, int k){
    const size_t n = result.size();

    size_t cols = (n > 0 && k > 0) ? static_cast<size_t>(k) : 0;
    for(size_t i=0; i<n; i++) cols = std::min(cols, result[i].size());

    std::cout << "Writing file with dimension " << cols << std::endl;
    std::cout << "File contains groundtruth for " << n << " query points" << std::endl;

    auto less = [&] (pid a, pid b) {return a.second < b.second;};
    parlay::parallel_for(0, n, [&] (size_t i){
      parlay::sort_inplace(result[i], less);
    });

    // Row-major flattening of the first `cols` entries of every sorted row.
    // The lambdas are never invoked when cols == 0, so idx / cols is safe.
    auto flat_ids = parlay::tabulate(n * cols, [&] (size_t idx){
        return static_cast<int>(result[idx / cols][idx % cols].first);
    });
    auto flat_dists = parlay::tabulate(n * cols, [&] (size_t idx){
        return static_cast<float>(result[idx / cols][idx % cols].second);
    });

    const int preamble[2] = {static_cast<int>(n), static_cast<int>(cols)};

    std::ofstream writer;
    writer.open(outFile, std::ios::binary | std::ios::out);
    if(!writer.is_open()){
        std::cout << "Error: could not open " << outFile << " for writing" << std::endl;
        abort();
    }
    writer.write((const char *) preamble, 2*sizeof(int));
    writer.write((const char *) flat_ids.begin(), n * cols * sizeof(int));
    writer.write((const char *) flat_dists.begin(), n * cols * sizeof(float));
    writer.close();
}

// Command-line driver: loads the base and query point files, computes the
// k nearest base points of every query by brute force and writes them to
// -gt_path in ibin format.
//
// All of -base_path, -query_path, -data_type, -dist_func and -gt_path must be
// given; -k defaults to 100. The process aborts on missing or invalid
// arguments.
int main(int argc, char* argv[]) {
  commandLine P(argc,argv,
  "[-base_path <b>] [-query_path <q>] "
      "[-data_type <d>] [-k <k> ] [-dist_func <d>] [-gt_path <outfile>]");

  char* gFile = P.getOptionValue("-gt_path");
  char* qFile = P.getOptionValue("-query_path");
  char* bFile = P.getOptionValue("-base_path");
  char* vectype = P.getOptionValue("-data_type");
  char* dfc = P.getOptionValue("-dist_func");
  int k = P.getOptionIntValue("-k", 100);

  // Constructing std::string from a null pointer is undefined behavior, so
  // reject missing options before any of them is converted.
  // NOTE(review): assumes getOptionValue yields a null pointer for an absent
  // option - confirm against parse_command_line.h.
  if(gFile == nullptr || qFile == nullptr || bFile == nullptr ||
     vectype == nullptr || dfc == nullptr){
    std::cout << "Error: -base_path, -query_path, -data_type, -dist_func and -gt_path are all required" << std::endl;
    abort();
  }

  if(k <= 0){
    std::cout << "Error: -k must be a positive integer" << std::endl;
    abort();
  }

  std::string df = std::string(dfc);
  if(df != "Euclidian" && df != "mips"){
    std::cout << "Error: invalid distance type: specify Euclidian or mips" << std::endl;
    abort();
  }

  std::string tp = std::string(vectype);
  if((tp != "uint8") && (tp != "int8") && (tp != "float")){
    std::cout << "Error: data type not specified correctly, specify int8, uint8, or float" << std::endl;
    abort();
  }

  std::cout << "Computing the " << k << " nearest neighbors" << std::endl;

  parlay::sequence<parlay::sequence<pid>> answers;

  // One branch per (coordinate type, distance function) combination; each
  // instantiates the matching point type and runs the brute-force search.
  if(tp == "float"){
    std::cout << "Detected float coordinates" << std::endl;
    if(df == "Euclidian"){
      auto B = PointRange<Euclidian_Point<float>>(bFile);
      auto Q = PointRange<Euclidian_Point<float>>(qFile);
      answers = compute_groundtruth<PointRange<Euclidian_Point<float>>>(B, Q, k);
    } else if(df == "mips"){
      auto B = PointRange<Mips_Point<float>>(bFile);
      auto Q = PointRange<Mips_Point<float>>(qFile);
      answers = compute_groundtruth<PointRange<Mips_Point<float>>>(B, Q, k);
    }
  }else if(tp == "uint8"){
    std::cout << "Detected uint8 coordinates" << std::endl;
    if(df == "Euclidian"){
      auto B = PointRange<Euclidian_Point<uint8_t>>(bFile);
      auto Q = PointRange<Euclidian_Point<uint8_t>>(qFile);
      answers = compute_groundtruth<PointRange<Euclidian_Point<uint8_t>>>(B, Q, k);
    } else if(df == "mips"){
      auto B = PointRange<Mips_Point<uint8_t>>(bFile);
      auto Q = PointRange<Mips_Point<uint8_t>>(qFile);
      answers = compute_groundtruth<PointRange<Mips_Point<uint8_t>>>(B, Q, k);
    }
  } else if(tp == "int8"){
    std::cout << "Detected int8 coordinates" << std::endl;
    if(df == "Euclidian"){
      auto B = PointRange<Euclidian_Point<int8_t>>(bFile);
      auto Q = PointRange<Euclidian_Point<int8_t>>(qFile);
      answers = compute_groundtruth<PointRange<Euclidian_Point<int8_t>>>(B, Q, k);
    } else if(df == "mips"){
      auto B = PointRange<Mips_Point<int8_t>>(bFile);
      auto Q = PointRange<Mips_Point<int8_t>>(qFile);
      answers = compute_groundtruth<PointRange<Mips_Point<int8_t>>>(B, Q, k);
    }
  }
  write_ibin(answers, std::string(gFile), k);

  return 0;
}


================================================
FILE: data_tools/compute_range_groundtruth.cpp
================================================
#include <iostream>
#include <algorithm>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/io.h"
// #include "utils/types.h"
#include "utils/euclidian_point.h"
#include "utils/mips_point.h"
#include "utils/point_range.h"
#include "../algorithms/bench/parse_command_line.h"

using namespace parlayANN;

// Brute-force range ground truth.
//
// For every query Q[i], collects the indices j (in increasing order) of all
// base points B[j] whose distance from the query is at most r.
//
//   B - base points
//   Q - query points
//   r - inclusive distance threshold
//
// Returns one index sequence per query; a sequence is empty when no base
// point lies within r.
template<typename PointRange>
parlay::sequence<parlay::sequence<int>> compute_range_groundtruth(PointRange &B,
  PointRange &Q, float r){
    const size_t q = Q.size();
    const size_t b = B.size();
    auto answers = parlay::tabulate(q, [&] (size_t i){
        parlay::sequence<int> results;
        for(size_t j=0; j<b; j++){
            const float dist = Q[i].distance(B[j]);
            // indices are stored as int, matching the on-disk format
            if(dist <= r) results.push_back(static_cast<int>(j));
        }
        return results;
    });
    std::cout << "Done computing groundtruth" << std::endl;
    return answers;
}

// Writes the coordinates of every query that has at least one range match.
//
// Output layout: [int count][int dimension] followed by count*dimension
// values of type T, where count is the number of non-empty rows of `result`
// and the rows are emitted in their original order.
//
//   result       - per-query match lists
//   Query_Points - the queries `result` refers to (indexed as [point][coord])
//   outFile      - destination path
template<typename PointRange, typename T>
void write_nonzero_elts(parlay::sequence<parlay::sequence<int>> &result, PointRange Query_Points, const std::string outFile){
    const size_t total = result.size();

    // 1 for a query with matches, 0 otherwise; the sum is the output count
    parlay::sequence<int> has_match = parlay::tabulate(total, [&] (size_t q){
        return result[q].size() > 0 ? 1 : 0;
    });
    size_t num_nonzero = parlay::reduce(has_match);
    std::cout << "Number of nonzero elements: " << num_nonzero << std::endl;

    int d = Query_Points.dimension();
    parlay::sequence<int> header = {static_cast<int>(num_nonzero), static_cast<int>(d)};
    parlay::sequence<T> coords(num_nonzero*d);

    // Each query contributes either its own index or nothing; flattening
    // leaves the indices of the matching queries in increasing order.
    parlay::sequence<parlay::sequence<int>> singletons = parlay::tabulate(total, [&] (size_t q){
      parlay::sequence<int> own;
      if(result[q].size() > 0) own.push_back(q);
      return own;
    });
    parlay::sequence<int> kept = parlay::flatten(singletons);
    if(kept.size() != num_nonzero) abort();

    parlay::parallel_for(0, kept.size(), [&] (size_t row){
      for(int c=0; c<d; c++){
        coords[d*row+c] = Query_Points[kept[row]][c];
      }
    });

    std::ofstream out;
    out.open(outFile, std::ios::binary | std::ios::out);
    out.write((char *) (header.begin()), 2*sizeof(int));
    out.write((char *) (coords.begin()), num_nonzero * sizeof(T) * d);
    out.close();
}

// Writes range-search ground truth.
//
// Output layout: [int num_points][int total_matches], then one int per point
// giving its number of matches, then all match ids concatenated in point
// order.
//
//   result  - per-point match lists
//   outFile - destination path
void write_rangeres(parlay::sequence<parlay::sequence<int>> &result, const std::string outFile){
    std::cout << "File contains range groundtruth for " << result.size() << " data points" << std::endl;

    const size_t num_points = result.size();
    parlay::sequence<int> per_point = parlay::tabulate(num_points, [&] (size_t p){
        return static_cast<int>(result[p].size());
    });
    size_t num_matches = parlay::reduce(per_point);
    std::cout << "Number of nonzero matches: " << num_matches << std::endl;

    parlay::sequence<int> header = {static_cast<int>(num_points), static_cast<int>(num_matches)};
    auto all_ids = parlay::flatten(result);

    std::ofstream out;
    out.open(outFile, std::ios::binary | std::ios::out);
    out.write((char *) header.begin(), 2*sizeof(int));
    out.write((char *) per_point.begin(), num_points * sizeof(int));
    out.write((char *) all_ids.begin(), num_matches * sizeof(int));
    out.close();
}


// Command-line driver: loads the base and query point files, finds for every
// query all base points within distance -r by brute force and writes the
// result to -gt_path.
//
// All of -base_path, -query_path, -data_type, -dist_func and -gt_path must be
// given; -r defaults to 0. The process aborts on missing or invalid
// arguments.
int main(int argc, char* argv[]) {
  commandLine P(argc,argv,
  "[-base_path <b>] [-query_path <q>] "
      "[-data_type <d>] [-r <r> ] [-dist_func <d>] [-gt_path <outfile>]");

  char* gFile = P.getOptionValue("-gt_path");
  char* qFile = P.getOptionValue("-query_path");
  char* bFile = P.getOptionValue("-base_path");
  char* vectype = P.getOptionValue("-data_type");
  char* dfc = P.getOptionValue("-dist_func");
  float r = P.getOptionDoubleValue("-r", 0);

  // Constructing std::string from a null pointer is undefined behavior, so
  // reject missing options before any of them is converted.
  // NOTE(review): assumes getOptionValue yields a null pointer for an absent
  // option - confirm against parse_command_line.h.
  if(gFile == nullptr || qFile == nullptr || bFile == nullptr ||
     vectype == nullptr || dfc == nullptr){
    std::cout << "Error: -base_path, -query_path, -data_type, -dist_func and -gt_path are all required" << std::endl;
    abort();
  }

  std::string df = std::string(dfc);
  if(df != "Euclidian" && df != "mips"){
    std::cout << "Error: invalid distance type: specify Euclidian or mips" << std::endl;
    abort();
  }

  std::string tp = std::string(vectype);
  if((tp != "uint8") && (tp != "int8") && (tp != "float")){
    std::cout << "Error: data type not specified correctly, specify int8, uint8, or float" << std::endl;
    abort();
  }

  std::cout << "Computing the groundtruth for radius " << r << std::endl;

  parlay::sequence<parlay::sequence<int>> answers;

  // One branch per (coordinate type, distance function) combination; each
  // instantiates the matching point type and runs the brute-force search.
  if(tp == "float"){
    std::cout << "Detected float coordinates" << std::endl;
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<float>> B = PointRange<Euclidian_Point<float>>(bFile);
      PointRange<Euclidian_Point<float>> Q = PointRange<Euclidian_Point<float>>(qFile);
      answers = compute_range_groundtruth<PointRange<Euclidian_Point<float>>>(B, Q, r);
    } else if(df == "mips"){
      PointRange<Mips_Point<float>> B = PointRange<Mips_Point<float>>(bFile);
      PointRange<Mips_Point<float>> Q = PointRange<Mips_Point<float>>(qFile);
      answers = compute_range_groundtruth<PointRange<Mips_Point<float>>>(B, Q, r);
    }
  }else if(tp == "uint8"){
    std::cout << "Detected uint8 coordinates" << std::endl;
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<uint8_t>> B = PointRange<Euclidian_Point<uint8_t>>(bFile);
      PointRange<Euclidian_Point<uint8_t>> Q = PointRange<Euclidian_Point<uint8_t>>(qFile);
      answers = compute_range_groundtruth<PointRange<Euclidian_Point<uint8_t>>>(B, Q, r);
    } else if(df == "mips"){
      PointRange<Mips_Point<uint8_t>> B = PointRange<Mips_Point<uint8_t>>(bFile);
      PointRange<Mips_Point<uint8_t>> Q = PointRange<Mips_Point<uint8_t>>(qFile);
      answers = compute_range_groundtruth<PointRange<Mips_Point<uint8_t>>>(B, Q, r);
    }
  }else if(tp == "int8"){
    std::cout << "Detected int8 coordinates" << std::endl;
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<int8_t>> B = PointRange<Euclidian_Point<int8_t>>(bFile);
      PointRange<Euclidian_Point<int8_t>> Q = PointRange<Euclidian_Point<int8_t>>(qFile);
      answers = compute_range_groundtruth<PointRange<Euclidian_Point<int8_t>>>(B, Q, r);
    } else if(df == "mips"){
      PointRange<Mips_Point<int8_t>> B = PointRange<Mips_Point<int8_t>>(bFile);
      PointRange<Mips_Point<int8_t>> Q = PointRange<Mips_Point<int8_t>>(qFile);
      answers = compute_range_groundtruth<PointRange<Mips_Point<int8_t>>>(B, Q, r);
    }
  }
  write_rangeres(answers, std::string(gFile));

  return 0;
}


================================================
FILE: data_tools/crop.cpp
================================================
#include <iostream>
#include <algorithm>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/io.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

// using namespace benchIO;
// *************************************************************
// Parsing code (should move to common?)
// *************************************************************

// Maps a regular file read-only into memory and returns (start address,
// size in bytes). The file descriptor is closed before returning; the
// mapping itself is left in place for the caller.
// Any failure (open, fstat, not a regular file, mmap, close) is reported
// through perror and terminates the process with exit(-1).
std::pair<char*, size_t> mmapStringFromFile(const char* filename) {
  auto die = [](const char* what) {
    perror(what);
    exit(-1);
  };

  struct stat info;
  const int fd = open(filename, O_RDONLY);
  if (fd == -1) die("open");
  if (fstat(fd, &info) == -1) die("fstat");
  if (!S_ISREG(info.st_mode)) die("not a file\n");

  void* mapping = mmap(0, info.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
  if (mapping == MAP_FAILED) die("mmap");
  if (close(fd) == -1) die("close");

  const size_t num_bytes = info.st_size;
  return std::make_pair(static_cast<char*>(mapping), num_bytes);
}

// Copies the first n points of a .bin file into a new .bin file.
//
// The input is read as [int num_points][int dim] followed by the point
// coordinates of type T; the output has the same layout with the point count
// set to n. The point count stored in the input header is not consulted;
// the request is validated against the actual file size instead, so a
// request for more points than the file holds is rejected rather than
// reading past the end of the mapping.
//
//   iFile - input path
//   n     - number of leading points to keep (must be >= 0)
//   oFile - output path
//
// Prints an error and terminates with exit(-1) on invalid input or if the
// output cannot be opened.
template<typename T>
void crop_file(char* iFile, int n, char* oFile){
  auto [fileptr, length] = mmapStringFromFile(iFile);

  const size_t header_bytes = 2*sizeof(int);
  if (length < header_bytes) {
    std::cout << "Error: " << iFile << " is too small to contain a header" << std::endl;
    exit(-1);
  }

  const int dim = *((int*) (fileptr+4));
  if (n < 0 || dim < 0) {
    std::cout << "Error: number of points and dimension must be non-negative" << std::endl;
    exit(-1);
  }

  // computed in size_t so that n * dim cannot overflow int
  size_t bytes_to_write = n;
  bytes_to_write *= dim;
  bytes_to_write *= sizeof(T);
  if (bytes_to_write > length - header_bytes) {
    std::cout << "Error: " << iFile << " does not contain " << n
              << " points of dimension " << dim << std::endl;
    exit(-1);
  }

  std::cout << "Writing " << n << " points with dimension " << dim << std::endl;
  const int preamble[2] = {n, dim};

  T* data = (T*)(fileptr+8);
  std::ofstream writer;
  writer.open(oFile, std::ios::binary | std::ios::out);
  if (!writer.is_open()) {
    std::cout << "Error: could not open " << oFile << " for writing" << std::endl;
    exit(-1);
  }

  writer.write((const char *) preamble, 2*sizeof(int));
  writer.write((char *) data, bytes_to_write);
  writer.close();

  // the mapping is no longer needed once the data has been written
  munmap(fileptr, length);
}

// Usage: crop <base> <num_points_to_crop> <tp> <oF>
// Writes the first <num_points_to_crop> points of <base> to <oF>; <tp> is the
// coordinate type (float, uint8 or int8).
// Returns 0 on success and 1 on a usage error.
int main(int argc, char* argv[]) {
  if (argc != 5) {
    std::cout << "usage: crop <base> <num_points_to_crop> <tp> <oF>" << std::endl;
    return 1;
  }

  // Parse the count strictly: atoi would silently turn garbage into 0 and
  // accept negative values.
  char* end = nullptr;
  const long requested = strtol(argv[2], &end, 10);
  const int n = static_cast<int>(requested);
  if (end == argv[2] || *end != '\0' || requested < 0 ||
      static_cast<long>(n) != requested) {
    std::cout << "Invalid number of points: " << argv[2] << std::endl;
    return 1;
  }

  std::string tp = std::string(argv[3]);

  if(tp == "float") crop_file<float>(argv[1], n, argv[4]);
  else if(tp == "uint8") crop_file<uint8_t>(argv[1], n, argv[4]);
  else if(tp == "int8") crop_file<int8_t>(argv[1], n, argv[4]);
  else{
    std::cout << "Invalid type, specify float, uint8, or int8" << std::endl;
    return 1;  // an unknown type is an error, not a success
  }

  return 0;
}

================================================
FILE: data_tools/random_sample.cpp
================================================
#include <iostream>
#include <algorithm>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/io.h"
#include "parlay/random.h"
#include "utils/mmap.h"

#include <random>

using namespace parlayANN;

// Writes n points drawn uniformly at random (with replacement) from a .bin
// file into a new .bin file.
//
// The input is read as [int num_points][int dim] followed by the point
// coordinates of type T; the output has the same layout with the point count
// set to n. Point i of the output is chosen with the i-th stream of a
// default-constructed parlay::random_generator.
//
//   iFile - input path
//   n     - number of points to sample (must be >= 0)
//   oFile - output path
//
// Prints an error and returns without writing anything when the request or
// the input header is invalid.
template<typename T>
void random_sample(char* iFile, int n, char* oFile){
    auto [fileptr, length] = mmapStringFromFile(iFile);

    const size_t header_bytes = 2*sizeof(int);
    if (static_cast<size_t>(length) < header_bytes) {
        std::cout << "Error: " << iFile << " is too small to contain a header" << std::endl;
        return;
    }

    int fsize = *((int*) fileptr);
    int dim = *((int*) (fileptr+4));
    // fsize must be positive: the distribution below ranges over [0, fsize-1]
    if (n < 0 || fsize <= 0 || dim < 0) {
        std::cout << "Error: invalid sample size or file header" << std::endl;
        return;
    }

    // all sizes are computed in size_t; the int products overflow for large inputs
    const size_t num = n;
    const size_t width = dim;
    const size_t payload_bytes = static_cast<size_t>(fsize) * width * sizeof(T);
    if (payload_bytes > static_cast<size_t>(length) - header_bytes) {
        std::cout << "Error: " << iFile << " is shorter than its header claims" << std::endl;
        return;
    }

    std::cout << "Writing " << n << " points with dimension " << dim << std::endl;
    parlay::sequence<int> preamble = {n, dim};

    parlay::random_generator gen;
    std::uniform_int_distribution<long> dis(0, fsize - 1);
    auto indices = parlay::tabulate(num, [&](size_t i) {
        auto r = gen[i];
        return dis(r);
    });

    T* start = (T*)(fileptr + 8);

    // Row-major copy of the sampled points. The lambda is never invoked when
    // width == 0, so idx / width is safe.
    auto data = parlay::tabulate(num * width, [&] (size_t idx){
        const size_t point = indices[idx / width];
        return start[width * point + idx % width];
    });

    std::ofstream writer;
    writer.open(oFile, std::ios::binary | std::ios::out);
    writer.write((char *)(preamble.begin()), 2*sizeof(int));
    writer.write((char *)(data.begin()), data.size()*sizeof(T));
    writer.close();
}

// Usage: random_sample <base> <num_points_to_crop> <tp> <oF>
// Writes <num_points_to_crop> randomly chosen points of <base> to <oF>; <tp>
// is the coordinate type (float, uint8 or int8).
// Returns 0 on success and 1 on a usage error.
int main(int argc, char* argv[]) {
  if (argc != 5) {
    std::cout << "usage: random_sample <base> <num_points_to_crop> <tp> <oF>" << std::endl;
    return 1;
  }

  // Parse the count strictly: atoi would silently turn garbage into 0 and
  // accept negative values.
  char* end = nullptr;
  const long requested = strtol(argv[2], &end, 10);
  const int n = static_cast<int>(requested);
  if (end == argv[2] || *end != '\0' || requested < 0 ||
      static_cast<long>(n) != requested) {
    std::cout << "Invalid number of points: " << argv[2] << std::endl;
    return 1;
  }

  std::string tp = std::string(argv[3]);

  if(tp == "float") random_sample<float>(argv[1], n, argv[4]);
  else if(tp == "uint8") random_sample<uint8_t>(argv[1], n, argv[4]);
  else if(tp == "int8") random_sample<int8_t>(argv[1], n, argv[4]);
  else{
    std::cout << "Invalid type, specify float, uint8, or int8" << std::endl;
    return 1;  // an unknown type is an error, not a success
  }

  return 0;
}

================================================
FILE: data_tools/vec_to_bin.cpp
================================================
#include <iostream>
#include <algorithm>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/io.h"

// convert from .bvec file to .u8bin file


auto convert_onebyte(const char* infile, const char* outfile) {
  auto str = parlay::chars_from_file(infile);
  int dims = *((int *) str.data());
  int n = str.size()/(dims+4);
  std::cout << "n = " << n << " d = " << dims << std::endl;
  auto vects = parlay::tabulate(n, [&] (size_t i) {
		     return parlay::to_sequence(str.cut(4 + i * (4 + dims), (i+1) * (4 + dims)));});
  parlay::sequence<char> head(8);
  *((int *) head.data()) = n;
  *(((int *) head.data()) + 1) = dims;
  auto strout = parlay::append(head, parlay::flatten(vects));
  parlay::chars_to_file(strout, outfile);
}

auto convert_fourbyte(const char* infile, const char* outfile) {
  auto str = parlay::chars_from_file(infile);
  int dims = *((int *) str.data());
  int n = str.size()/(4*dims+4);
  std::cout << "n = " << n << " d = " << dims << std::endl;
  auto vects = parlay::tabulate(n, [&] (size_t i) {
		     return parlay::to_sequence(str.cut(4 + i * (4 + 4*dims), (i+1) * (4 + 4*dims)));});
  parlay::sequence<char> head(8);
  *((int *) head.data()) = n;
  *(((int *) head.data()) + 1) = dims;
  auto strout = parlay::append(head, parlay::flatten(vects));
  parlay::chars_to_file(strout, outfile);
}

// Usage: vec_to_bin type <infile> <outfile>
// Converts <infile> to .bin layout. `type` selects the coordinate width:
// "uint8" for one-byte coordinates, "float" or "int" for four-byte ones.
// Returns 1 on a wrong argument count and aborts on an unknown type.
int main(int argc, char* argv[]) {
  if (argc != 4) {
    std::cout << "usage: vec_to_bin type <infile> <outfile>" << std::endl;
    return 1;
  }
  std::string tp = std::string(argv[1]);
  if(tp == "uint8") convert_onebyte(argv[2], argv[3]);
  // logical ||, not bitwise |: the second comparison is skipped when the
  // first one already matches
  else if(tp == "float" || tp == "int") convert_fourbyte(argv[2], argv[3]);
  else{
    std::cout << "invalid type: specify uint8, float, or int" << std::endl;
    abort();
  }
  return 0;
}


================================================
FILE: docs/README.md
================================================
# ParlayANN

ParlayANN is a library of approximate nearest neighbor search algorithms, along with a set of useful tools for designing such algorithms. It is written in C++ and uses parallel primitives from [ParlayLib](https://cmuparlay.github.io/parlaylib/). Currently it includes implementations of the ANNS algorithms [DiskANN](https://github.com/microsoft/DiskANN), [HNSW](https://github.com/nmslib/hnswlib), [HCNNG](https://github.com/jalvarm/hcnng), and [pyNNDescent](https://pynndescent.readthedocs.io/en/latest/).

To install, [clone the repo](https://github.com/cmuparlay/ParlayANN/tree/main) and then initiate the ParlayLib submodule:

```bash
git submodule init
git submodule update
```

See the following documentation for help getting started:
- [Quickstart](https://cmuparlay.github.io/ParlayANN/quickstart)
- [Algorithms](https://cmuparlay.github.io/ParlayANN/algorithms)
- [Data Tools](https://cmuparlay.github.io/ParlayANN/data_tools)

This repository was built for our paper [Scaling Graph-Based ANNS Algorithms to Billion-Size Datasets: A Comparative Analysis](https://arxiv.org/abs/2305.04359). If you use this repository for your own work, please cite us:

```bibtex
@inproceedings{ANNScaling,
  author = {Manohar, Magdalen Dobson and Shen, Zheqi and Blelloch, Guy and Dhulipala, Laxman and Gu, Yan and Simhadri, Harsha Vardhan and Sun, Yihan},
  title = {ParlayANN: Scalable and Deterministic Parallel Graph-Based Approximate Nearest Neighbor Search Algorithms},
  year = {2024},
  isbn = {9798400704352},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3627535.3638475},
  doi = {10.1145/3627535.3638475},
  booktitle = {Proceedings of the 29th ACM SIGPLAN Annual Symposium on Principles and Practice of Parallel Programming},
  pages = {270–285},
  numpages = {16},
  keywords = {nearest neighbor search, vector search, parallel algorithms},
  location = {Edinburgh, United Kingdom},
  series = {PPoPP '24}
}
```

The range search algorithms are from our paper [Range Retrieval with Graph-Based Indices](https://arxiv.org/abs/2502.13245). If you use this repository for your own work, please cite us:

```bibtex
@misc{manohar2025range,
      title={Range Retrieval with Graph-Based Indices}, 
      author={Magdalen Dobson Manohar and Taekseung Kim and Guy E. Blelloch},
      year={2025},
      eprint={2502.13245},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2502.13245}, 
}
```

================================================
FILE: docs/algorithms.md
================================================
# Algorithms

The algorithms in this folder share a common main file and thus a common commandline interface. The commandline interface allows the user to build an ANNS graph and write it to an outfile, load a graph and search it, or build and search in one shot. It contains several "generic" parameters that can be repurposed for a new benchmark. In the following examples, we provide instructions for building indices using bash. The instructions assume that the user has downloaded, converted, and built groundtruth for the 100K slice of the BIGANN dataset, as shown in the quickstart instructions. If you want to use range searching, we also provide instructions for computing range groundtruth in `data_tools.md`.

### Universal Parameters

#### Parameters for building:
1. **-graph_outfile** (optional): if graph is not already built, path the graph is written to. This is optional; if not provided, the graph will be built and will print timing and statistics before terminating.
2. **-data_type**: type of the base and query vectors. Currently "float", "int8", and "uint8" are supported.
3. **-dist_func**: the distance function to use when calculating nearest neighbors. Currently Euclidean distance ("Euclidian") and maximum inner product search ("mips") are supported.
4. **-base_path**: path to the base file. We only work with files in the .bin format; for your convenience, a converter from the popular .vecs format has been provided in the data tools folder.

#### Parameters for searching:

1. **-gt_path**: path to the ground truth, in .ibin format.
2. **-graph_path** (optional): path to the ANNS graph in the case of using an already built graph.
3. **-query_path**: path to the queries in .bin format.
4. **-res_path** (optional): path where a CSV file of results can be written (it is written to in append form, so it can be used to collect results of multiple runs).
5. **-k** (`long`): the number of nearest neighbors to search for.


### Algorithms

Next we provide some descriptions and example commandline arguments for each algorithm in the implementation.

## Vamana (DiskANN)

Vamana, also known as DiskANN, is an algorithm introduced in [DiskANN: Fast Accurate Billion-point Nearest
Neighbor Search on a Single Node](https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf) by Subramanya et al., with original code in the [DiskANN repo](https://github.com/microsoft/DiskANN). It builds a graph incrementally, and its insert procedure does a variant on greedy search or beam search with a frontier size $L$ on the existing graph and uses the nodes visited during the search as edge candidates. The visited nodes are pruned to a list of size $R$ by pruning out points that are likely to become long edges of triangles, with a parameter $a$ that is used to control how aggressive the prune step is. 

1. **R** (`long`): the degree bound.
2. **L** (`long`): the beam width to use when building the graph.
3. **alpha** (`double`): the pruning parameter.
4. **two_pass** (`bool`): optional argument that allows the user to build the graph with two passes or just one (two passes approximately doubles the build time, but provides higher accuracy).

To build a Vamana graph on BIGANN-100K and save it to memory, use the following commandline:

```bash
cd vamana
make
./neighbors -R 32 -L 64 -alpha 1.2 -graph_outfile ../../data/sift/sift_learn_32_64 -data_type float -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

To load an already built graph and query it, use the following:
```bash
cd vamana
make
./neighbors -R 32 -L 64 -alpha 1.2 -graph_path ../../data/sift/sift_learn_32_64 -query_path ../../data/sift/sift_query.fbin -gt_path ../../data/sift/sift-100K -res_path ../../data/vamana_res.csv -data_type float  -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

To build, query, and save to memory, use the following:
```bash
cd vamana
make
./neighbors -R 32 -L 64 -alpha 1.2 -graph_outfile ../../data/sift/sift_learn_32_64 -query_path ../../data/sift/sift_query.fbin -gt_path ../../data/sift/sift-100K -res_path ../../data/vamana_res.csv -data_type float -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

To execute range search using Vamana, use the following commandline. Note that range searching currently does not support exporting data to a CSV file: 

```bash
cd ../rangeSearch/vamanaRange
make
./range -R 32 -L 64 -alpha 1.2 -graph_outfile ../../data/sift/sift_learn_32_64 -query_path ../../data/sift/sift_query.fbin -gt_path ../../data/sift/sift-100K-range -data_type float -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

## HNSW

HNSW is an algorithm proposed in [Efficient and Robust Approximate Nearest Neighbor Search Using Hierarchical Navigable Small World Graphs](https://dl.acm.org/doi/10.1109/TPAMI.2018.2889473) by Malkov and Yashunin, of which an implementation is available at [hnswlib](https://github.com/nmslib/hnswlib) and is maintained by the paper authors. HNSW incrementally builds a hierarchical structure consisting of multiple layers, where each layer is a proximity graph with the Navigable Small World (NSW) property. The lower layers are always supersets of the upper ones, and the bottom layer contains all the base points. During construction, each point is randomly assigned a height in a logarithmic distribution and repeatedly inserted into all the layers below. As the two points incident to an edge in higher layers have a longer distance between them, the hierarchical structure allows the search to quickly approach the query point at high layers first and then do a fine-grained search at low layers.
Its parameters are as follows:

1. **m** (`long`): the degree bound. Typically between 16 and 64. The graph at the bottom layer (layer0) has the degree bound of $2m$ while graphs at upper layers have degree bound of $m$.
2. **efc** (`long`): the beam width to use when building the graph. Should be set at least $2.5m$, and up to 500.
3. **alpha** (`double`): the pruning parameter. Should be set between 1.0 and 1.15 for similarity measures that are not metrics (e.g. maximum inner product), and between 0.8 and 1.0 for metric spaces. 
4. **ml** (`double`): optional argument to control the number of layers (height). Increasing $ml$ results in more layers, which increases the build time but potentially improves the query performance; however, improper settings of $ml$ (too high or too low) can incur extra work per query and thus hurt the query performance. It should be set around $1/\log m$.

A commandline with suggested parameters for HNSW for the BIGANN-100K dataset is as follows:
```bash
cd HNSW
make
./neighbors -m 20 -efc 50 -alpha 0.9 -ml 0.34 -graph_outfile ../../data/sift/sift_learn_20_50_034 -query_path ../../data/sift/sift_query.fbin -gt_path ../../data/sift/sift-100K -res_path ../../data/hnsw_res.csv -data_type float -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

## HCNNG

HCNNG is an algorithm taken from [Hierarchical Clustering-Based Graphs for Large Scale Approximate Nearest Neighbor Search](https://www.researchgate.net/publication/334477189_Hierarchical_Clustering-Based_Graphs_for_Large_Scale_Approximate_Nearest_Neighbor_Search) by Munoz et al. and originally implemented in [this repository](https://github.com/jalvarm/hcnng). Roughly, it builds a tree by recursively partitioning the data using random partitions until it reaches a leaf size of at most 1000 points, and then builds a bounded-degree MST with the points in each leaf. The edges from the MST are used as the edges in the graph. The algorithm repeats this process a total of $L$ times and merges the edges into the graph on each iteration. Its parameters are as follows:

1. **mst_deg** (`long`): the degree bound of the graph built by each individual cluster tree.
2. **num_clusters** (`long`): the number of cluster trees.
3. **cluster_size** (`long`): the leaf size of each cluster tree.

A commandline with suggested parameters for HCNNG for the BIGANN-100K dataset is as follows:

```bash
cd HCNNG
make
./neighbors -cluster_size 1000 -mst_deg 3 -num_clusters 30  -graph_outfile ../../data/sift/sift_learn_3_10 -query_path ../../data/sift/sift_query.fbin -gt_path ../../data/sift/sift-100K -res_path ../../data/hcnng_res.csv -data_type float -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

## pyNNDescent

[pyNNDescent](https://pynndescent.readthedocs.io/en/latest/) is an ANNS algorithm by Leland McInnes. It works based on the principle that in a k-nearest neighbor graph, a neighbor of a neighbor is likely to be a neighbor. It finds an approximate nearest neighbor graph by building some number of random clustering trees and calculating exhaustive nearest neighbors at the leaves. Then, it proceeds in rounds, connecting each vertex to the neighbors of each of its neighbors and keeping the $R$ closest neighbors on each round. After terminating, it prunes out long edges of triangles; in our version, we add a pruning parameter $d$ to control for a denser graph if desired.

1. **R** (`long`): the graph degree bound.
2. **num_clusters** (`long`): the number of cluster trees to use when initializing the graph.
3. **cluster_size** (`long`): the leaf size of the cluster trees.
4. **alpha** (`double`): the pruning parameter for the final pruning step.
5. **delta** (`double`): the early stopping parameter for the nnDescent process.


```bash
cd pyNNDescent
make
./neighbors -R 40 -cluster_size 100 -num_clusters 10 -alpha 1.2 -delta 0.05 -graph_outfile ../../data/sift/sift_learn_30 -query_path ../../data/sift/sift_query.fbin -gt_path ../../data/sift/sift-100K -res_path ../../data/pynn_res.csv -data_type float -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

## Searching

Each graph is searched using a version of the greedy search/beam search algorithm described in [DiskANN: Fast Accurate Billion-point Nearest Neighbor Search on a Single Node](https://proceedings.neurips.cc/paper_files/paper/2019/file/09853c7fb1d3f8ee67a61b6bf4a7f8e6-Paper.pdf). It also incorporates the optimization suggested in [Pruned Bi-Directed K-Nearest Neighbor Graph for Proximity Search](https://link.springer.com/chapter/10.1007/978-3-319-46759-7_2) of pruning the search frontier when it includes points that are far away from the current $k$-nearest neighbor. Instead of taking in parameters specified by the user, the search routine tries a wide variety of parameter choices and reports those that maximize QPS for a given recall value. The search parameters (see `types.h` in the utils folder) can be tuned if you are developing your own algorithm and are as follows:

1. **Q** (`long`): the beam width. Must be set at least $k$. Controls the number of candidate neighbors retained at any point in the search and is for the most part the chief determinant of accuracy and speed of the search. 
2. **k** (`long`): number of nearest neighbors to search for. 
3. **cut** (`double`): controls pruning the frontier of points that are far away from the current $k$ nearest neighbors. Used only for distance functions that are true metrics (as opposed to similarities that may not obey the triangle inequality, etc.)
4. **visited limit** (`long`): controls the maximum number of vertices visited during the beam search. Used for low accuracy searches; set to the number of vertices in the graph if you don't want any limit.
5. **degree limit** (`long`): controls the maximum number of out-neighbors read when visiting a vertex. Also useful for low accuracy searches. Note that if the out-neighbors are not sorted in order of distance, it does not make sense to use this parameter. 


================================================
FILE: docs/data_tools.md
================================================
# Data Tools

ParlayANN provides various useful tools for manipulating and reading datasets in common formats. For all of the examples below, it is assumed that the BIGANN dataset is downloaded and stored in ParlayANN/data/sift. You can do this using the following commandline:

```bash
mkdir -p data && cd data
wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
tar -xf sift.tar.gz
```

You then need to convert two of the datasets from the .fvecs format to the binary format as follows:

```bash
make vec_to_bin
./vec_to_bin float ../data/sift/sift_learn.fvecs ../data/sift/sift_learn.fbin
./vec_to_bin float ../data/sift/sift_query.fvecs ../data/sift/sift_query.fbin
```

## Compute Groundtruth

ParlayANN supports computing the exact groundtruth for k-nearest neighbors for bin files. The commandline for computing the groundtruth takes the following parameters:
1. **-base_path**: pointer to the base file, which ground truth will be calculated with respect to.
2. **-query_path**: pointer to the query file, for which the ground truth will be calculated.
3. **-data_type**: type of the query and base files. Current options are "uint8", "int8", and "float".
4. **-k**: the number of nearest neighbors to calculate. Default is 100.
5. **-dist_func**: the distance function to use when computing the ground truth. Current options are "Euclidian" for Euclidean distance and "mips" for maximum inner product.
6. **-gt_path**: the path where the new groundtruth file will be written

The following is an example of how to compute the groundtruth for a 100K slice of the BIGANN dataset:

```bash
make compute_groundtruth
./compute_groundtruth -base_path ../data/sift/sift_learn.fbin -query_path ../data/sift/sift_query.fbin -data_type float -k 100 -dist_func Euclidian -gt_path ../data/sift/sift-100K
```

## Compute Range Groundtruth

We also support computing groundtruth for range search, i.e. finding all points in a given radius. The commandline takes the following parameters:
1. **-base_path**: pointer to the base file, which ground truth will be calculated with respect to.
2. **-query_path**: pointer to the query file, for which the ground truth will be calculated.
3. **-data_type**: type of the query and base files. Current options are "uint8", "int8", and "float".
4. **-rad**: the radius for which to calculate the groundtruth.
5. **-dist_func**: the distance function to use when computing the ground truth. Current options are "Euclidian" for Euclidean distance and "mips" for maximum inner product.
6. **-gt_path**: the path where the new groundtruth file will be written

An example commandline is as follows:

```bash
make compute_range_groundtruth
./compute_range_groundtruth -base_path ../data/sift/sift_learn.fbin -query_path ../data/sift/sift_query.fbin -data_type float -rad 5000 -dist_func Euclidian -gt_path ../data/sift/sift-100K-range
```

The range groundtruth is written in binary format in integers. It consists of first the number of datapoints, followed by the total number of range results for the whole dataset, followed by the number of results for each individual point, followed by the result ids. 

## File Conversion

ParlayANN supports converting a .vecs file to a .bin file for vectors with `float`, `uint8`, and `int` coordinates. An example commandline:

```bash
make vec_to_bin
./vec_to_bin float ../data/sift/sift_learn.fvecs ../data/sift/sift_learn.fbin
```

## Cropping

Crop a file to the desired size:

```bash
make crop
./crop ../data/sift/sift_learn.fbin 50000 float ../data/sift/sift_50K.fbin
```

## Random Sampling

Take a random sample of desired size from a file:

```bash
make random_sample
./random_sample ../data/sift/sift_learn.fbin 50000 float ../data/sift/sift_50K_random.fbin
```


================================================
FILE: docs/quickstart.md
================================================
# Quickstart

The following is a crash course in quickly building and querying an index using ParlayANN.

First, download a 100K slice of the BIGANN dataset.

```bash
mkdir -p data && cd data
wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
tar -xf sift.tar.gz
```

Next, convert it from the .fvecs format to binary format:

```bash
cd ../data_tools
make vec_to_bin
./vec_to_bin float ../data/sift/sift_learn.fvecs ../data/sift/sift_learn.fbin
./vec_to_bin float ../data/sift/sift_query.fvecs ../data/sift/sift_query.fbin
```

Next, calculate its ground truth up to $k=100$. See the README in the data_tools folder for an explanation of each parameter.

```bash
cd ../data_tools
make compute_groundtruth
./compute_groundtruth -base_path ../data/sift/sift_learn.fbin -query_path ../data/sift/sift_query.fbin -data_type float -k 100 -dist_func Euclidian -gt_path ../data/sift/sift-100K
```

To build an index using Vamana and write it to an outfile, use the following commandline:
```bash
cd ../algorithms/vamana
make
./neighbors -R 32 -L 64 -alpha 1.2 -two_pass 0 -graph_outfile ../../data/sift/sift_learn_32_64 -data_type float -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

You should see the following output; timing will vary based on your machine and these times are taken from a machine with 72 cores:

```bash
Detected 100000 points with dimension 128
Building graph...
Pass 10% complete
Pass 20% complete
Pass 30% complete
Pass 40% complete
Pass 50% complete
Pass 60% complete
Pass 70% complete
Pass 80% complete
Pass 90% complete
Pass 100% complete
beam search time: total: 0.3436
bidirect time: total: 0.0557
prune time: total: 0.3751
Average visited: 68, Tail visited: 76
Vamana graph built with 100000 points and parameters R = 32, L = 64
Graph has average degree 26.94 and maximum degree 32
Graph built in 0.8123 seconds
Parlay time: 0.8138
Writing graph with 100000 points and max degree 32
```

To load and then query the index, use the following command:
```bash
cd ../algorithms/vamana
make
./neighbors -R 32 -L 64 -alpha 1.2 -graph_path ../../data/sift/sift_learn_32_64 -query_path ../../data/sift/sift_query.fbin -gt_path ../../data/sift/sift-100K -data_type float -dist_func Euclidian -base_path ../../data/sift/sift_learn.fbin
```

You should see an output similar to the following; timings were taken using a machine with 72 cores.

```bash
Detected 10000 points with num results 100
Detected 100000 points with dimension 128
Detected 10000 points with dimension 128
Detected 100000 points with max degree 32
Average visited: 0, Tail visited: 0
Vamana graph built with 100000 points and parameters R = 32, L = 64
Graph has average degree 26.9416 and maximum degree 32
Graph built in 0 seconds
For 10@10 recall = 0.11027, QPS = 5.05561e+06, Q = 10, cut = 1.35, visited limit = 6, degree limit: 16, average visited = 6, average cmps = 89
For 10@10 recall = 0.2032, QPS = 4.40141e+06, Q = 10, cut = 1.35, visited limit = 8, degree limit: 16, average visited = 8, average cmps = 115
For 10@10 recall = 0.36414, QPS = 2.58598e+06, Q = 10, cut = 1.35, visited limit = 10, degree limit: 19, average visited = 10, average cmps = 169
For 10@10 recall = 0.45495, QPS = 2.68528e+06, Q = 10, cut = 1.35, visited limit = 10, degree limit: 22, average visited = 10, average cmps = 195
For 10@10 recall = 0.59687, QPS = 2.35627e+06, Q = 10, cut = 1.35, visited limit = 10, degree limit: 25, average visited = 10, average cmps = 219
For 10@10 recall = 0.61139, QPS = 2.07857e+06, Q = 13, cut = 1.35, visited limit = 13, degree limit: 22, average visited = 13, average cmps = 245
For 10@10 recall = 0.7209, QPS = 1.86532e+06, Q = 11, cut = 1.35, visited limit = 11, degree limit: 28, average visited = 11, average cmps = 261
For 10@10 recall = 0.75036, QPS = 1.90006e+06, Q = 13, cut = 1.35, visited limit = 13, degree limit: 25, average visited = 13, average cmps = 274
For 10@10 recall = 0.81667, QPS = 1.80538e+06, Q = 15, cut = 1.35, visited limit = 15, degree limit: 25, average visited = 15, average cmps = 310
For 10@10 recall = 0.86956, QPS = 1.55304e+06, Q = 15, cut = 1.35, visited limit = 15, degree limit: 28, average visited = 15, average cmps = 339
For 10@10 recall = 0.92219, QPS = 1.4652e+06, Q = 15, cut = 1.35, visited limit = 15, degree limit: 32, average visited = 15, average cmps = 372
For 10@10 recall = 0.95779, QPS = 1.15009e+06, Q = 12, cut = 1.35, visited limit = 100000, degree limit: 32, average visited = 18, average cmps = 436
For 10@10 recall = 0.97133, QPS = 955658, Q = 17, cut = 1.35, visited limit = 100000, degree limit: 32, average visited = 22, average cmps = 529
For 10@10 recall = 0.98078, QPS = 775014, Q = 24, cut = 1.35, visited limit = 100000, degree limit: 32, average visited = 29, average cmps = 656
For 10@10 recall = 0.99151, QPS = 473530, Q = 45, cut = 1.35, visited limit = 100000, degree limit: 32, average visited = 49, average cmps = 1026
For 10@10 recall = 0.99509, QPS = 351296, Q = 70, cut = 1.35, visited limit = 100000, degree limit: 32, average visited = 71, average cmps = 1356
For 10@10 recall = 0.99912, QPS = 186532, Q = 180, cut = 1.35, visited limit = 100000, degree limit: 32, average visited = 145, average cmps = 2279
For 10@10 recall = 0.9995, QPS = 151930, Q = 250, cut = 1.35, visited limit = 100000, degree limit: 32, average visited = 174, average cmps = 2546
For 10@10 recall = 0.99995, QPS = 13560.6, Q = 1000, cut = 10, visited limit = 100000, degree limit: 32, average visited = 1003, average cmps = 7885
For 10@10 recall = 0.99995, QPS = 13560.6, Q = 1000, cut = 10, visited limit = 100000, degree limit: 32, average visited = 1003, average cmps = 7885
```


================================================
FILE: docs/rangesearch.md
================================================
# Range Search 

Range search is defined as finding every point within a specified radius of a query point with respect to some dataset. This repository contains the algorithms introduced in the paper [Range Retrieval with Graph-Based Indices](https://arxiv.org/abs/2502.13245).

## Sample commandline and parameters

The range groundtruth file should be computed before running these commands. The tools for doing so are provided in the data_tools library. For further explanation, see [Data Tools](https://cmuparlay.github.io/ParlayANN/data_tools).

An example commandline for generating range ground truth is shown below. This example is also explained in the [Quickstart](https://cmuparlay.github.io/ParlayANN/quickstart) guide. In this case, the **SIFT dataset** refers to the BIGANN dataset, as described in the [Quickstart](https://cmuparlay.github.io/ParlayANN/quickstart).


```
cd ../data_tools
make compute_range_groundtruth
./compute_range_groundtruth -base_path ../data/sift/sift_learn.fbin -query_path ../data/sift/sift_query.fbin -data_type float -k 100 -dist_func Euclidian -gt_path ../data/sift/sift-100K
```

To run a range search on sift run:
```
cd rangeSearch/vamanaRange
R=../../data/sift
make
./range  -alpha 1.15 -R 64 -L 128 -r 10000 -base_path $R/sift_learn.fbin -data_type uint8 -dist_func Euclidian -query_path $R/sift_query.fbin  -gt_path $R/range_gt_1M_10000 -search_mode beamSearch -early_stop -graph_path $R/graph1M  -early_stopping_radius 30000
```

All other parameters are the same as in [Algorithms](https://cmuparlay.github.io/ParlayANN/algorithms). Here we add descriptions for the parameters that are new.

1. **-r**(`double`): Range search radius
2. **-search_mode**(`string`): The search mode to use can be specified. Possible options are ['doubling', 'greedy', 'beam'], corresponding to the three algorithms introduced in our paper. The default option is beam search.
3. **-early_stop**(optional): Flag for early stopping. With this flag on, range search would stop early based on early stopping radius.
4. **-early_stopping_radius**(`double`): Radius for early stopping. Typically larger than the range search radius.


================================================
FILE: python/__init__.py
================================================
# // This code is part of the Problem Based Benchmark Suite (PBBS)
# // Copyright (c) 2011 Guy Blelloch and the PBBS team
# //
# // Permission is hereby granted, free of charge, to any person obtaining a
# // copy of this software and associated documentation files (the
# // "Software"), to deal in the Software without restriction, including
# // without limitation the rights (to use, copy, modify, merge, publish,
# // distribute, sublicense, and/or sell copies of the Software, and to
# // permit persons to whom the Software is furnished to do so, subject to
# // the following conditions:
# //
# // The above copyright notice and this permission notice shall be included
# // in all copies or substantial portions of the Software.
# //
# // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""
# Documentation Overview
`ParlayANNpy` is mostly structured around 2 distinct processes: [Index Builder Functions](#index-builders) and [Search Classes](#search-classes)
This code is adapted from the DiskANN pybindings and heavily draws on their implementation.

It also includes a few nascent [utilities](#utilities).

And lastly, it makes substantial use of type hints, with various shorthand [type aliases](#parameter-and-response-type-aliases) documented. 
When reading the `ParlayANNpy` code we refer to the type aliases.

## Index Builders
- `build_vamana_index` - Builds an in-memory Vamana index

## Search Classes
- `VamanaIndex` - fully in memory and static

## Parameter Defaults
- `ParlayANNpy.defaults` - Default values exported from the C++ extension for Python users

## Parameter and Response Type Aliases
- `DistanceMetric` - What distance metrics does `ParlayANNpy` support?
- `VectorDType` - What vector datatypes does `ParlayANNpy` support?
- `QueryResponse` - What can I expect as a response to my search?
- `QueryResponseBatch` - What can I expect as a response to my batch search?
- `VectorIdentifier` - What types do `ParlayANNpy` support as vector identifiers?
- `VectorIdentifierBatch` - A batch of identifiers of the exact same type. The type can change, but they must **all** change.
- `VectorLike` - How does a vector look to `ParlayANNpy`, to be inserted or searched with.
- `VectorLikeBatch` - A batch of those vectors, to be inserted or searched with.
- `Metadata` - DiskANN vector binary file metadata (num_points, vector_dim)

## Utilities
- `vectors_to_file` - Turns a 2 dimensional `numpy.typing.NDArray[VectorDType]` with shape `(number_of_points, vector_dim)` into a ParlayANN vector bin file.
- `vectors_from_file` - Reads a ParlayANN vector bin file representing stored vectors into a numpy ndarray.
- `vectors_metadata_from_file` - Reads metadata stored in a ParlayANN vector bin file without reading the entire file
- `valid_dtype` - Checks if a given vector dtype is supported by `ParlayANNpy`
"""

from typing import Any, Literal, NamedTuple, Type, Union

import numpy as np
from numpy import typing as npt

DistanceMetric = Literal["Euclidian", "mips"]
""" Type alias for one of {"Euclidian", "mips"} """
VectorDType = Union[Type[np.float32], Type[np.int8], Type[np.uint8]]
""" Type alias for one of {`numpy.float32`, `numpy.int8`, `numpy.uint8`} """
VectorLike = npt.NDArray[VectorDType]
""" Type alias for something that can be treated as a vector """
VectorLikeBatch = npt.NDArray[VectorDType]
""" Type alias for a batch of VectorLikes """
VectorIdentifier = np.uint32
""" 
Type alias for a vector identifier, whether it be an implicit array index identifier from StaticMemoryIndex or 
StaticDiskIndex, or an explicit tag identifier from DynamicMemoryIndex 
"""
VectorIdentifierBatch = npt.NDArray[np.uint32]
""" Type alias for a batch of VectorIdentifiers """


class QueryResponse(NamedTuple):
    """
    Response to a single query: a tuple of two values, identifiers and distances. Both are 1d arrays that
    positionally correspond, and will contain the nearest neighbors from [0..k_neighbors)
    """

    identifiers: npt.NDArray[VectorIdentifier]
    """ A `numpy.typing.NDArray[VectorIdentifier]` array of vector identifiers, 1 dimensional """
    distances: npt.NDArray[np.float32]
    """
    A `numpy.typing.NDArray[numpy.float32]` of distances as calculated by the distance metric function, 1 dimensional
    """


class QueryResponseBatch(NamedTuple):
    """
    Response to a batch of queries: a tuple of two values, identifiers and distances. Both are 2d arrays, with
    the rows corresponding to the number of queries made, and the columns corresponding to the k neighbors
    requested. The two 2d arrays have an implicit, position-based relationship
    """

    identifiers: npt.NDArray[VectorIdentifier]
    """ 
    A `numpy.typing.NDArray[VectorIdentifier]` array of vector identifiers, 2 dimensional. The row corresponds to index 
    of the query, and the column corresponds to the k neighbors requested 
    """
    # Annotated with `npt.NDArray` (dtype-parameterized) to match `QueryResponse.distances`;
    # `np.ndarray[np.float32]` would place float32 in ndarray's first (shape) type parameter.
    distances: npt.NDArray[np.float32]
    """  
    A `numpy.typing.NDArray[numpy.float32]` of distances as calculated by the distance metric function, 2 dimensional. 
    The row corresponds to the index of the query, and the column corresponds to the distance of the query to the 
    *k-th* neighbor 
    """


from . import defaults
from ._builder import build_vamana_index
from ._builder import build_hcnng_index
from ._common import valid_dtype
# TODO implement searching once index build works
# from ._dynamic_memory_index import DynamicMemoryIndex
from ._files import (
    Metadata,
    vectors_from_file,
    vectors_metadata_from_file,
    vectors_to_file,
)
# from ._static_disk_index import StaticDiskIndex
# from ._static_memory_index import StaticMemoryIndex

# Names re-exported by `from <package> import *`: the index builders, the
# `defaults` module, the type aliases and response tuples defined above, and the
# vector-file helpers imported from `._files` / `._common`.
__all__ = [
    "build_vamana_index",
    "build_hcnng_index",
    # "StaticDiskIndex", //TODO add back search index
    "defaults",
    "DistanceMetric",
    "VectorDType",
    "QueryResponse",
    "QueryResponseBatch",
    "VectorIdentifier",
    "VectorIdentifierBatch",
    "VectorLike",
    "VectorLikeBatch",
    "Metadata",
    "vectors_metadata_from_file",
    "vectors_to_file",
    "vectors_from_file",
    "valid_dtype",
]

================================================
FILE: python/_builder.py
================================================
# // This code is part of the Problem Based Benchmark Suite (PBBS)
# // Copyright (c) 2011 Guy Blelloch and the PBBS team
# //
# // Permission is hereby granted, free of charge, to any person obtaining a
# // copy of this software and associated documentation files (the
# // "Software"), to deal in the Software without restriction, including
# // without limitation the rights (to use, copy, modify, merge, publish,
# // distribute, sublicense, and/or sell copies of the Software, and to
# // permit persons to whom the Software is furnished to do so, subject to
# // the following conditions:
# //
# // The above copyright notice and this permission notice shall be included
# // in all copies or substantial portions of the Software.
# //
# // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import os
import shutil
from pathlib import Path
from typing import Optional, Tuple, Union

import numpy as np

from . import VectorDType, VectorIdentifierBatch, VectorLikeBatch
from . import _ParlayANNpy as _parlayann
from ._common import (
    _assert,
    _assert_is_nonnegative_uint32,
    _assert_is_positive_uint32,
    _castable_dtype_or_raise,
    _valid_metric,
    _write_index_metadata,
    valid_dtype,
)
from ._ParlayANNpy import defaults
from ._files import tags_to_file, vectors_metadata_from_file, vectors_to_file


def _valid_path_and_dtype(
    data: Union[str, VectorLikeBatch],
    vector_dtype: VectorDType,
    index_path: str,
    index_prefix: str,
) -> Tuple[str, VectorDType]:
    """
    Resolve the on-disk vector file and canonical dtype for an index build.

    - When `data` is a `str` it is taken as the path of an existing vector file and
      `vector_dtype` is canonicalised with `valid_dtype`.
    - Otherwise `data` is written to `<index_path>/<index_prefix>_vectors.bin` with
      `vectors_to_file`, and the dtype is taken from `data.dtype`.

    Returns the tuple `(vector_bin_path, dtype)`.

    Raises `ValueError` if a path given as `data` is not an existing file, or if the
    target file for in-memory vectors already exists.
    """
    if not isinstance(data, str):
        # In-memory vectors: persist them next to the index, refusing to overwrite.
        target = os.path.join(index_path, f"{index_prefix}_vectors.bin")
        if Path(target).exists():
            raise ValueError(
                f"The path {target} already exists. Remove it and try again."
            )
        resolved_dtype = valid_dtype(data.dtype)
        vectors_to_file(vector_file=target, vectors=data)
        return target, resolved_dtype

    # Path given: it must already point at a regular file.
    source = Path(data)
    _assert(
        source.exists() and source.is_file(),
        "if data is of type `str`, it must both exist and be a file",
    )
    return data, valid_dtype(vector_dtype)


def build_memory_index(
    data: Union[str, VectorLikeBatch],
    distance_metric: str,
    index_directory: str,
    beam_width: int,
    graph_degree: int,
    alpha: float = defaults.ALPHA,
    vector_dtype: Optional[VectorDType] = None,
    index_prefix: str = "ann",
) -> None:
    """
    Build an in-memory graph index and write it, plus a metadata file, under `index_directory`.

    ### Parameters
    - **data**: Either the path (`str`) of an existing binary vector file, or a numpy array of
      vectors. An array is first written to `<index_directory>/<index_prefix>_vectors.bin`.
    - **distance_metric**: Name of the distance metric; validated by `_valid_metric`.
    - **index_directory**: Existing directory that receives the index files.
    - **beam_width**: Positive uint32 forwarded to the native builder as `beam_width`.
    - **graph_degree**: Positive uint32 forwarded to the native builder as `graph_degree`.
    - **alpha**: Must be >= 1. Forwarded to the native builder as `alpha`.
    - **vector_dtype**: Required when `data` is a path; ignored when `data` is an array
      (the array's own dtype is used).
    - **index_prefix**: Non-empty file-name prefix for everything written.

    ### Raises
    `ValueError` when any argument fails validation.
    """
    _assert(
        (isinstance(data, str) and vector_dtype is not None)
        or isinstance(data, np.ndarray),
        "vector_dtype is required if data is a str representing a path to the vector bin file",
    )
    dap_metric = _valid_metric(distance_metric)
    # Validate only the parameters this function actually receives. The previous checks
    # referenced names (`complexity`, `num_threads`, `num_pq_bytes`, `filter_complexity`)
    # that are not parameters here and raised NameError on every call.
    _assert_is_positive_uint32(beam_width, "beam_width")
    _assert_is_positive_uint32(graph_degree, "graph_degree")
    _assert(
        alpha >= 1,
        "alpha must be >= 1, and realistically should be kept between [1.0, 2.0)",
    )
    _assert(index_prefix != "", "index_prefix cannot be an empty string")

    index_path = Path(index_directory)
    _assert(
        index_path.exists() and index_path.is_dir(),
        "index_directory must both exist and be a directory",
    )

    vector_bin_path, vector_dtype_actual = _valid_path_and_dtype(
        data, vector_dtype, index_directory, index_prefix
    )

    num_points, dimensions = vectors_metadata_from_file(vector_bin_path)

    # Pick the native entry point matching the element type; anything that is not
    # uint8/int8 falls through to the float builder.
    # NOTE(review): confirm these attribute names are exported by the `_ParlayANNpy` module.
    if vector_dtype_actual == np.uint8:
        _builder = _parlayann.build_memory_uint8_index
    elif vector_dtype_actual == np.int8:
        _builder = _parlayann.build_memory_int8_index
    else:
        _builder = _parlayann.build_memory_float_index

    index_prefix_path = os.path.join(index_directory, index_prefix)

    _builder(
        distance_metric=dap_metric,
        data_file_path=vector_bin_path,
        index_output_path=index_prefix_path,
        beam_width=beam_width,
        graph_degree=graph_degree,
        alpha=alpha,
    )

    # Record dtype/metric/shape next to the index so it can be reopened without re-specifying them.
    _write_index_metadata(
        index_prefix_path, vector_dtype_actual, dap_metric, num_points, dimensions
    )

================================================
FILE: python/_builder.pyi
================================================
# // This code is part of the Problem Based Benchmark Suite (PBBS)
# // Copyright (c) 2011 Guy Blelloch and the PBBS team
# //
# // Permission is hereby granted, free of charge, to any person obtaining a
# // copy of this software and associated documentation files (the
# // "Software"), to deal in the Software without restriction, including
# // without limitation the rights (to use, copy, modify, merge, publish,
# // distribute, sublicense, and/or sell copies of the Software, and to
# // permit persons to whom the Software is furnished to do so, subject to
# // the following conditions:
# //
# // The above copyright notice and this permission notice shall be included
# // in all copies or substantial portions of the Software.
# //
# // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from typing import BinaryIO, Optional, overload

import numpy as np

from . import VectorDType, VectorIdentifierBatch, VectorLikeBatch

# Type stub only (no body).
# NOTE(review): the `_builder.py` in this tree does not define `numpy_to_diskann_file`;
# confirm this declaration is still needed.
def numpy_to_diskann_file(vectors: np.ndarray, file_handler: BinaryIO): ...
# Type stub for the Vamana index builder. The tuning parameters (`beam_width`, `graph_degree`,
# `alpha`, `two_pass`) carry the same names as the native `build_vamana_index` template in
# `python/builder.cpp`.
# NOTE(review): the `_builder.py` in this tree does not define this function; confirm where it is bound.
def build_vamana_index(
    data: str,
    distance_metric: str,
    index_directory: str,
    beam_width: int,
    graph_degree: int,
    alpha: float,
    two_pass: bool,
    vector_dtype: VectorDType,
    index_prefix: str,
) -> None: ...
# Type stub for the HCNNG index builder; `mst_deg`, `num_clusters` and `cluster_size` carry the
# same names as the native `build_hcnng_index` template in `python/builder.cpp`.
# NOTE(review): `cluster_size` is annotated `float` here, while the native template takes
# `uint32_t cluster_size` — confirm which is intended.
def build_hcnng_index(
    data: str,
    distance_metric: str,
    index_directory: str,
    mst_deg: int,
    num_clusters: int,
    cluster_size: float,
    vector_dtype: VectorDType,
    index_prefix: str,
) -> None: ...
# Type stub for the pyNNDescent index builder; `max_deg`, `num_clusters`, `cluster_size`, `alpha`
# and `delta` carry the same names as the native `build_pynndescent_index` template in
# `python/builder.cpp`.
# NOTE(review): `cluster_size` is annotated `float` here, while the native template takes
# `uint32_t cluster_size` — confirm which is intended.
def build_pynndescent_index(
    data: str,
    distance_metric: str,
    index_directory: str,
    max_deg: int,
    num_clusters: int,
    cluster_size: float,
    alpha: float,
    delta: float,
    vector_dtype: VectorDType,
    index_prefix: str,
) -> None: ...


================================================
FILE: python/_common.py
================================================
# // This code is part of the Problem Based Benchmark Suite (PBBS)
# // Copyright (c) 2011 Guy Blelloch and the PBBS team
# //
# // Permission is hereby granted, free of charge, to any person obtaining a
# // copy of this software and associated documentation files (the
# // "Software"), to deal in the Software without restriction, including
# // without limitation the rights (to use, copy, modify, merge, publish,
# // distribute, sublicense, and/or sell copies of the Software, and to
# // permit persons to whom the Software is furnished to do so, subject to
# // the following conditions:
# //
# // The above copyright notice and this permission notice shall be included
# // in all copies or substantial portions of the Software.
# //
# // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import os
import warnings
from enum import Enum
from pathlib import Path
from typing import Literal, NamedTuple, Optional, Tuple, Type, Union

import numpy as np

from . import (
    DistanceMetric,
    VectorDType,
    VectorIdentifierBatch,
    VectorLike,
    VectorLikeBatch,
)
from . import _ParlayANNpy as _parlayann

# NOTE(review): Python only consults the lower-case `__all__` for star-imports; this upper-case
# `__ALL__` is an ordinary module variable — confirm whether `__all__` was intended.
__ALL__ = ["valid_dtype"]

# Vector element types supported by this package; the dtype checks below test against this list.
_VALID_DTYPES = [np.float32, np.int8, np.uint8]


def valid_dtype(dtype: Type) -> VectorDType:
    """
    Utility method to determine whether the provided dtype is supported, and if so, the canonical
    dtype we will use internally (e.g. np.single -> np.float32).

    Returns one of `np.uint8`, `np.int8` or `np.float32`.

    Raises `ValueError` if `dtype` is rejected by `_assert_dtype`, or if it passes that
    (castability-based) check without being equal to one of the three supported types.
    """
    _assert_dtype(dtype)
    for canonical in (np.uint8, np.int8, np.float32):
        if dtype == canonical:
            return canonical
    # `_assert_dtype` accepts anything safely castable to a supported type (e.g. float16 or
    # bool). Such a dtype used to fall off the end and yield None, which callers then treated
    # as a real dtype. Fail explicitly instead.
    raise ValueError(
        f"Vector dtype {dtype} is not one of the supported types np.float32, np.int8, or np.uint8"
    )


def _assert(statement_eval: bool, message: str):
    if not statement_eval:
        raise ValueError(message)


def _valid_metric(metric: str) -> str:
    if not isinstance(metric, str):
        raise ValueError("distance_metric must be a string")
    if metric.lower() == "Euclidian":
        return "Euclidian"
    elif metric.lower() == "mips":
        return "mips"
    else:
        raise ValueError("distance_metric must be one of 'l2', 'mips', or 'cosine'")


def _assert_dtype(dtype: Type):
    """
    Raise `ValueError` unless `dtype` can be cast (per `np.can_cast`'s default rule) to at
    least one entry of `_VALID_DTYPES`.

    Note this is a castability test, not an equality test, so narrower types also pass.
    """
    castable = False
    for supported in _VALID_DTYPES:
        if np.can_cast(dtype, supported):
            castable = True
            break
    _assert(
        castable,
        f"Vector dtype must be of one of type {{(np.single, np.float32), (np.byte, np.int8), (np.ubyte, np.uint8)}}",
    )


def _castable_dtype_or_raise(
    data: Union[VectorLike, VectorLikeBatch, VectorIdentifierBatch], expected: np.dtype
) -> np.ndarray:
    """
    Return `data` converted to dtype `expected` using a safe cast.

    Raises `TypeError` when `data` is not a numpy array or its dtype cannot be cast to
    `expected` according to `np.can_cast`.
    """
    acceptable = isinstance(data, np.ndarray) and np.can_cast(data.dtype, expected)
    if not acceptable:
        raise TypeError(
            f"expecting a numpy ndarray of dtype {expected}, not a {type(data)}"
        )
    return data.astype(expected, casting="safe")


def _assert_2d(vectors: np.ndarray, name: str):
    """Raise `ValueError` unless `vectors` has exactly two dimensions; `name` labels it in the message."""
    rank = len(vectors.shape)
    _assert(rank == 2, f"{name} must be 2d numpy array")


# 2**32 - 1, the largest value an unsigned 32-bit integer can hold; used by the uint32 range checks below.
__MAX_UINT32_VAL = 4_294_967_295


def _assert_is_positive_uint32(test_value: int, parameter: str):
    """
    Raise `ValueError` unless `test_value` is not None and lies in `[1, 2**32 - 1]`.

    `parameter` is the argument name used in the error message.
    """
    _assert(
        # The upper bound is inclusive: 2**32 - 1 is itself a valid uint32 and was
        # previously rejected by a strict `<` comparison.
        test_value is not None and 0 < test_value <= __MAX_UINT32_VAL,
        f"{parameter} must be a positive integer in the uint32 range",
    )


def _assert_is_nonnegative_uint32(test_value: int, parameter: str):
    """
    Raise `ValueError` unless `test_value` is not None and lies in `[0, 2**32 - 1]`.

    `parameter` is the argument name used in the error message.
    """
    _assert(
        # `0 <=` (rather than `-1 <`) also rejects fractional negatives such as -0.5, and the
        # inclusive upper bound admits 2**32 - 1, which is a valid uint32.
        test_value is not None and 0 <= test_value <= __MAX_UINT32_VAL,
        f"{parameter} must be a non-negative integer in the uint32 range",
    )


def _assert_is_nonnegative_uint64(test_value: int, parameter: str):
    """
    Raise `ValueError` unless `test_value` is not None and lies in `[0, 2**64 - 1]`.

    `parameter` is the argument name used in the error message.
    """
    max_uint64 = 18_446_744_073_709_551_615  # 2**64 - 1
    _assert(
        # Previously only the lower bound was checked (and None raised TypeError), so values
        # above the uint64 range passed despite the message. Both ends are now enforced,
        # matching the uint32 helpers.
        test_value is not None and 0 <= test_value <= max_uint64,
        f"{parameter} must be a non-negative integer in the uint64 range",
    )


def _assert_existing_directory(path: str, parameter: str):
    """Raise `ValueError` unless `path` exists and is a directory; `parameter` labels it in the message."""
    candidate = Path(path)
    is_present_dir = candidate.exists() and candidate.is_dir()
    _assert(is_present_dir, f"{parameter} must be an existing directory")


def _assert_existing_file(path: str, parameter: str):
    """Raise `ValueError` unless `path` exists and is a regular file; `parameter` labels it in the message."""
    candidate = Path(path)
    is_present_file = candidate.exists() and candidate.is_file()
    _assert(is_present_file, f"{parameter} must be an existing file")


class _DataType(Enum):
    """Integer codes used to persist the vector element type in the index metadata file."""

    FLOAT32 = 0
    INT8 = 1
    UINT8 = 2

    @classmethod
    def from_type(cls, vector_dtype: VectorDType) -> "_DataType":
        """
        Map a numpy element type to its enum member.

        Raises `ValueError` for anything other than float32, int8 or uint8. Previously this
        returned None, which only surfaced later as an AttributeError on `.value`.
        """
        # `==` (not a dict lookup) is kept so that `np.dtype(...)` instances compare
        # equal to the corresponding scalar types, as they did before.
        for member, numpy_type in (
            (cls.FLOAT32, np.float32),
            (cls.INT8, np.int8),
            (cls.UINT8, np.uint8),
        ):
            if vector_dtype == numpy_type:
                return member
        raise ValueError(
            f"Unsupported vector dtype {vector_dtype}; expected np.float32, np.int8, or np.uint8"
        )

    def to_type(self) -> VectorDType:
        """Return the numpy element type corresponding to this member."""
        if self is _DataType.FLOAT32:
            return np.float32
        if self is _DataType.INT8:
            return np.int8
        if self is _DataType.UINT8:
            return np.uint8


def _build_metadata_path(index_path_and_prefix: str) -> str:
    return index_path_and_prefix + "_metadata.bin"


# Integer codes under which the metric name is stored in the (all-uint64) metadata file.
# The keys are the canonical names produced by `_valid_metric`.
_METRIC_CODES = {"Euclidian": 0, "mips": 1}


def _write_index_metadata(
    index_path_and_prefix: str,
    dtype: VectorDType,
    metric: str,
    num_points: int,
    dimensions: int,
):
    """
    Write the index metadata file next to the index.

    The file holds four uint64 values, in order: data-type code (`_DataType`), metric code
    (`_METRIC_CODES`), number of points, and dimensions.

    Raises `ValueError` if `metric` is not a key of `_METRIC_CODES`.
    """
    # The metric arrives as a string, which cannot be stored in a uint64 array directly
    # (doing so raised ValueError on every call), so it is written as an integer code.
    _assert(
        metric in _METRIC_CODES,
        f"metric must be one of {sorted(_METRIC_CODES)}, not {metric!r}",
    )
    np.array(
        [
            _DataType.from_type(dtype).value,
            _METRIC_CODES[metric],
            num_points,
            dimensions,
        ],
        dtype=np.uint64,
    ).tofile(_build_metadata_path(index_path_and_prefix))


def _read_index_metadata(
    index_path_and_prefix: str,
) -> Optional[Tuple[VectorDType, str, np.uint64, np.uint64]]:
    """
    Read the index metadata file written by `_write_index_metadata`.

    Returns `None` if no metadata file exists for this index; otherwise the tuple
    `(vector dtype, metric name, number of points, dimensions)`.

    Raises `ValueError` if the file has fewer than four values or an unknown metric code.
    """
    path = _build_metadata_path(index_path_and_prefix)
    if not Path(path).exists():
        return None

    metadata = np.fromfile(path, dtype=np.uint64, count=-1)
    _assert(
        metadata.size >= 4,
        f"metadata file {path} is truncated: expected 4 values, found {metadata.size}",
    )
    # Decode the stored integer back to the metric name (the previous
    # `int(...).to_str()` call does not exist and raised AttributeError).
    metric_code = int(metadata[1])
    metric_name = next(
        (name for name, code in _METRIC_CODES.items() if code == metric_code), None
    )
    _assert(
        metric_name is not None,
        f"metadata file {path} contains an unknown metric code {metric_code}",
    )
    return (
        _DataType(int(metadata[0])).to_type(),
        metric_name,
        metadata[2],
        metadata[3],
    )


def _ensure_index_metadata(
    index_path_and_prefix: str,
    vector_dtype: Optional[VectorDType],
    distance_metric: Optional[str],
    max_vectors: int,
    dimensions: Optional[int],
) -> Tuple[VectorDType, str, np.uint64, np.uint64]:
    """
    Return the metadata describing an index, preferring the saved metadata file.

    - If a metadata file exists, its contents are returned unchanged. Warnings are emitted
      when the saved vector count exceeds or equals `max_vectors`.
    - If no metadata file exists, `vector_dtype`, `distance_metric` and `dimensions` must all
      be supplied; they are validated and returned together with `max_vectors`.

    Raises `ValueError` if the caller-supplied values are needed but missing or invalid.
    """
    stored = _read_index_metadata(index_path_and_prefix)

    if stored is not None:
        _, _, saved_count, _ = stored
        if max_vectors is not None and saved_count > max_vectors:
            warnings.warn(
                "The number of vectors in the saved index exceeds the max_vectors parameter. "
                "max_vectors is being adjusted to accommodate the dataset, but any insertions will fail."
            )
            # Only the local value is adjusted; the stored metadata is what gets returned.
            max_vectors = saved_count
        if saved_count == max_vectors:
            warnings.warn(
                "The number of vectors in the saved index equals max_vectors parameter. Any insertions will fail."
            )
        return stored

    # No metadata file: everything must come from the caller.
    _assert(
        all([vector_dtype, distance_metric, dimensions]),
        "distance_metric, vector_dtype, and dimensions must provided if a corresponding metadata file has not "
        "been built for this index, such as when an index was built via the CLI tools or prior to the addition "
        "of a metadata file",
    )
    _assert_dtype(vector_dtype)
    _assert_is_positive_uint32(max_vectors, "max_vectors")
    _assert_is_positive_uint32(dimensions, "dimensions")
    return vector_dtype, distance_metric, max_vectors, dimensions  # type: ignore


def _valid_index_prefix(index_directory: str, index_prefix: str) -> str:
    """
    Validate an index directory and prefix and return them joined into a single path.

    Raises `ValueError` if the directory is None/empty, does not exist, or the prefix is empty.
    """
    directory_given = index_directory is not None and index_directory != ""
    _assert(directory_given, "index_directory cannot be None or empty")
    _assert_existing_directory(index_directory, "index_directory")
    prefix_given = index_prefix != ""
    _assert(prefix_given, "index_prefix cannot be an empty string")
    return os.path.join(index_directory, index_prefix)

================================================
FILE: python/_files.py
================================================
# // This code is part of the Problem Based Benchmark Suite (PBBS)
# // Copyright (c) 2011 Guy Blelloch and the PBBS team
# //
# // Permission is hereby granted, free of charge, to any person obtaining a
# // copy of this software and associated documentation files (the
# // "Software"), to deal in the Software without restriction, including
# // without limitation the rights (to use, copy, modify, merge, publish,
# // distribute, sublicense, and/or sell copies of the Software, and to
# // permit persons to whom the Software is furnished to do so, subject to
# // the following conditions:
# //
# // The above copyright notice and this permission notice shall be included
# // in all copies or substantial portions of the Software.
# //
# // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import warnings
from typing import BinaryIO, NamedTuple

import numpy as np
import numpy.typing as npt

from . import VectorDType, VectorIdentifierBatch, VectorLikeBatch
from ._common import _assert, _assert_2d, _assert_dtype, _assert_existing_file


class Metadata(NamedTuple):
    """
    DiskANN binary vector files contain a small stanza containing some metadata about them.

    This tuple holds that stanza; `vectors_metadata_from_file` builds it from the first two
    32-bit integers of a file, in the order (`num_vectors`, `dimensions`).
    """

    num_vectors: int
    """ The number of vectors in the file. """
    dimensions: int
    """ The dimensionality of the vectors in the file. """


def vectors_metadata_from_file(vector_file: str) -> Metadata:
    """
    Read the metadata from a DiskANN binary vector file.

    The metadata is the file's first two 32-bit integers: the number of vectors, then the
    dimensionality. Both are returned as plain Python `int`s.

    ### Parameters
    - **vector_file**: The path to the vector file to read the metadata from.

    ### Returns
    `Metadata`

    ### Raises
    `ValueError` if `vector_file` does not exist or is too short to hold the header.
    """
    _assert_existing_file(vector_file, "vector_file")
    header = np.fromfile(file=vector_file, dtype=np.int32, count=2)
    _assert(
        header.size == 2,
        "vector_file is too short to contain the (num_vectors, dimensions) header",
    )
    # Convert to Python ints: `Metadata` declares `int` fields, and numpy int32 scalars
    # would overflow in products such as num_vectors * dimensions.
    return Metadata(int(header[0]), int(header[1]))


def _write_bin(data: np.ndarray, file_handler: BinaryIO):
    if len(data.shape) == 1:
        _ = file_handler.write(np.array([data.shape[0], 1], dtype=np.int32).tobytes())
    else:
        _ = file_handler.write(np.array(data.shape, dtype=np.int32).tobytes())
    _ = file_handler.write(data.tobytes())


def vectors_to_file(vector_file: str, vectors: VectorLikeBatch) -> None:
    """
    Write a batch of vectors to `vector_file` in the DiskANN binary vector format
    (shape header followed by the raw vector data).

    ### Parameters
    - **vector_file**: Destination path; opened for binary writing.
    - **vectors**: A 2d array of dtype `numpy.float32`, `numpy.uint8`, or `numpy.int8`

    ### Raises
    `ValueError` if the dtype check or the 2-d check fails; nothing is written in that case.
    """
    # Validate before opening so a rejected input never creates or truncates the file.
    _assert_dtype(vectors.dtype)
    _assert_2d(vectors, "vectors")
    with open(vector_file, "wb") as sink:
        _write_bin(vectors, sink)


def vectors_from_file(vector_file: str, dtype: VectorDType) -> npt.NDArray[VectorDType]:
    """
    Load all vectors from a DiskANN binary vector file as a 2d array.

    ### Parameters
    - **vector_file**: The path to the vector file to read the vectors from.
    - **dtype**: The data type of the vectors in the file. Ensure you match the data types exactly

    ### Returns
    `numpy.typing.NDArray[dtype]` of shape `(num_vectors, dimensions)` as given by the file header.
    """
    header = vectors_metadata_from_file(vector_file)
    # The payload starts after the 8-byte header (two 32-bit integers).
    flat = np.fromfile(file=vector_file, dtype=dtype, offset=8)
    return flat.reshape(header.num_vectors, header.dimensions)


================================================
FILE: python/big_env.yml
================================================
name: bigann
channels:
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=2_kmp_llvm
  - asttokens=2.0.5=pyhd3eb1b0_0
  - backcall=0.2.0=pyhd3eb1b0_0
  - bzip2=1.0.8=h7b6447c_0
  - ca-certificates=2023.7.22=hbcca054_0
  - colorama=0.4.6=pyhd8ed1ab_0
  - comm=0.1.2=py310h06a4308_0
  - debugpy=1.6.7=py310h6a678d5_0
  - decorator=5.1.1=pyhd3eb1b0_0
  - executing=0.8.3=pyhd3eb1b0_0
  - faiss=1.7.4=py310h9ed8947_0_cpu
  - gmp=6.2.1=h58526e2_0
  - gmpy2=2.1.2=py310h3ec546c_1
  - ipykernel=6.19.2=py310h2f386ee_0
  - ipython=8.12.0=py310h06a4308_0
  - jedi=0.18.1=py310h06a4308_1
  - jupyter_client=8.1.0=py310h06a4308_0
  - jupyter_core=5.3.0=py310h06a4308_0
  - ld_impl_linux-64=2.38=h1181459_1
  - libblas=3.9.0=17_linux64_openblas
  - libcblas=3.9.0=17_linux64_openblas
  - libfaiss=1.7.4=hf47d654_0_cpu
  - libfaiss-avx2=1.7.4=h1234567_0_cpu
  - libffi=3.4.4=h6a678d5_0
  - libgcc-ng=13.1.0=he5830b7_0
  - libgfortran-ng=13.1.0=h69a702a_0
  - libgfortran5=13.1.0=h15d22d2_0
  - liblapack=3.9.0=17_linux64_openblas
  - libopenblas=0.3.23=pthreads_h80387f5_0
  - libsodium=1.0.18=h7b6447c_0
  - libstdcxx-ng=13.1.0=hfd8a6a1_0
  - libuuid=1.41.5=h5eee18b_0
  - llvm-openmp=14.0.6=h9e868ea_0
  - matplotlib-inline=0.1.6=py310h06a4308_0
  - mpc=1.3.1=hfe3b2da_0
  - mpfr=4.2.0=hb012696_0
  - ncurses=6.4=h6a678d5_0
  - nest-asyncio=1.5.6=py310h06a4308_0
  - openssl=3.1.2=hd590300_0
  - parso=0.8.3=pyhd3eb1b0_0
  - pexpect=4.8.0=pyhd3eb1b0_3
  - pickleshare=0.7.5=pyhd3eb1b0_1003
  - pip=23.2.1=py310h06a4308_0
  - platformdirs=2.5.2=py310h06a4308_0
  - prompt-toolkit=3.0.36=py310h06a4308_0
  - ptyprocess=0.7.0=pyhd3eb1b0_2
  - pure_eval=0.2.2=pyhd3eb1b0_0
  - pybind11=2.11.1=py310hd41b1e2_0
  - pybind11-global=2.11.1=py310hd41b1e2_0
  - pygments=2.15.1=py310h06a4308_1
  - python=3.10.12=h955ad1f_0
  - python-dateutil=2.8.2=pyhd3eb1b0_0
  - python_abi=3.10=2_cp310
  - pyzmq=25.1.0=py310h6a678d5_0
  - readline=8.2=h5eee18b_0
  - setuptools=68.0.0=py310h06a4308_0
  - six=1.16.0=pyhd3eb1b0_1
  - sqlite=3.41.2=h5eee18b_0
  - stack_data=0.2.0=pyhd3eb1b0_0
  - tk=8.6.12=h1ccaba5_0
  - tornado=6.3.2=py310h5eee18b_0
  - tqdm=4.66.1=pyhd8ed1ab_0
  - traitlets=5.7.1=py310h06a4308_0
  - wcwidth=0.2.5=pyhd3eb1b0_0
  - wheel=0.38.4=py310h06a4308_0
  - xz=5.4.2=h5eee18b_0
  - zeromq=4.3.4=h2531618_0
  - zlib=1.2.13=h5eee18b_0
  - pip:
      - ansicolors==1.1.8
      - certifi==2023.7.22
      - charset-normalizer==3.2.0
      - cycler==0.11.0
      - docker==6.1.2
      - h5py==3.8.0
      - idna==3.4
      - jinja2==3.1.2
      - joblib==1.3.1
      - kiwisolver==1.4.4
      - markupsafe==2.1.3
      - matplotlib==3.3.4
      - numpy==1.24.2
      - packaging==23.1
      - pandas==2.0.0
      - pillow==10.0.0
      - psutil==5.9.4
      - pyparsing==3.1.1
      - pytz==2023.3
      - pyyaml==6.0
      - requests==2.31.0
      - scikit-learn==1.3.0
      - scipy==1.10.1
      - threadpoolctl==3.2.0
      - tzdata==2023.3
      - urllib3==2.0.4
      - websocket-client==1.6.1
prefix: /home/ben/miniconda3/envs/bigann


================================================
FILE: python/builder.cpp
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <memory>

#include "../algorithms/vamana/index.h"
#include "../algorithms/HCNNG/hcnng_index.h"
#include "../algorithms/pyNNDescent/pynn_index.h"
#include "../algorithms/HNSW/HNSW.hpp"
#include "../algorithms/utils/types.h"
#include "../algorithms/utils/point_range.h"
#include "../algorithms/utils/graph.h"
#include "../algorithms/utils/euclidian_point.h"
#include "../algorithms/utils/mips_point.h"
#include "../algorithms/utils/stats.h"

using namespace parlayANN;

template <typename T, typename Point>
void build_vamana_index(std::string metric, std::string &vector_bin_path,
                        std::string &index_output_path, uint32_t graph_degree, uint32_t beam_width,
                        float alpha, bool two_pass)
{
  //use file parsers to create Point object

  using Range = PointRange<Point>;
  Range* Points = new Range(vector_bin_path.data());
  if (!Point::is_metric()) { // normalize if not a metric
    std::cout << "normalizing" << std::endl;
    for (int i=0; i < Points->size(); i++) 
      (*Points)[i].normalize();
    if (Points->dimension() <= 200) {
      if (Points->dimension() < 100) {
        std::cout << "Setting alpha to 1.0 because dimensionality is " << Points->dimension() << " (< 100)" << std::endl;
        alpha = 1.0;
      } else {
        std::cout << "Setting alpha to 0.98 because dimensionality is " << Points->dimension() << " (>= 100 & <= 200)" << std::endl;
        alpha = .98;
        };
    }
  }

  //instantiate build params and stats objects
  BuildParams BP(graph_degree, beam_width, alpha, two_pass ? 2 : 1);
  stats<unsigned int> BuildStats(Points->size());

  if (sizeof(typename Range::Point::T) > 1) {
    if (Point::is_metric()) {
      using QuantT = uint8_t;
      using QuantPoint = Euclidian_Point<QuantT>;
      using QuantRange = PointRange<QuantPoint>;
      QuantRange Quant_Points(*Points);  // quantized to one byte
      delete Points; // remove original points
      Graph<unsigned int> G = Graph<unsigned int>(graph_degree, Points->size());

      //call the build function
      using index = knn_index<QuantRange, QuantRange, unsigned int>;
      index I(BP);
      I.build_index(G, Quant_Points, Quant_Points, BuildStats);
      G.save(index_output_path.data());
    } else {
      using QuantT = int8_t;
      using QuantPoint = Quantized_Mips_Point<8, true>;
      using QuantRange = PointRange<QuantPoint>;
      QuantRange Quant_Points(*Points);  // quantized to one byte
      delete Points;  // remove original points
      Graph<unsigned int> G = Graph<unsigned int>(graph_degree, Points->size());

      //call the build function
      using index = knn_index<QuantRange, QuantRange, unsigned int>;
      index I(BP);
      I.build_index(G, Quant_Points, Quant_Points, BuildStats);
      G.save(index_output_path.data());
    }
  } else {
    Graph<unsigned int> G = Graph<unsigned int>(graph_degree, Points->size());
    using index = knn_index<PointRange<Point>, PointRange<Point>, unsigned int>;
    index I(BP);
    I.build_index(G, *Points, *Points, BuildStats);
    G.save(index_output_path.data());
  } 
}

// Explicit instantiations of build_vamana_index for each supported element type
// (float, int8_t, uint8_t) paired with the Euclidian and Mips point types.
template void build_vamana_index<float, Euclidian_Point<float>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                        float, bool);                            
template void build_vamana_index<float, Mips_Point<float>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                        float, bool);

template void build_vamana_index<int8_t, Euclidian_Point<int8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                         float, bool);
template void build_vamana_index<int8_t, Mips_Point<int8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                         float, bool);

template void build_vamana_index<uint8_t, Euclidian_Point<uint8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                          float, bool);
template void build_vamana_index<uint8_t, Mips_Point<uint8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                          float, bool);


template <typename T, typename Point>
void build_hcnng_index(std::string metric, std::string &vector_bin_path,
                         std::string &index_output_path, uint32_t mst_deg, uint32_t num_clusters,
                        uint32_t cluster_size)
{
    
    //instantiate build params object
    BuildParams BP(num_clusters, cluster_size, mst_deg);
    uint32_t graph_degree = BP.max_degree();

    //use file parsers to create Point object

    PointRange<Point> Points(vector_bin_path.data());
    //use max degree info to create Graph object
    Graph<unsigned int> G = Graph<unsigned int>(graph_degree, Points.size());

    //call the build function
    using index = hcnng_index<Point, PointRange<Point>, unsigned int>;
    index I;
    stats<unsigned int> BuildStats(G.size());
    I.build_index(G, Points, BP.num_clusters, BP.cluster_size, BP.MST_deg);

    //save the graph object
    G.save(index_output_path.data());

    
}

// Explicit instantiations of build_hcnng_index for each supported element type
// (float, int8_t, uint8_t) paired with the Euclidian and Mips point types.
template void build_hcnng_index<float, Euclidian_Point<float>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                        uint32_t);                            
template void build_hcnng_index<float, Mips_Point<float>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                        uint32_t);

template void build_hcnng_index<int8_t, Euclidian_Point<int8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                         uint32_t);
template void build_hcnng_index<int8_t, Mips_Point<int8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                         uint32_t);

template void build_hcnng_index<uint8_t, Euclidian_Point<uint8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                          uint32_t);
template void build_hcnng_index<uint8_t, Mips_Point<uint8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                          uint32_t);


template <typename T, typename Point>
void build_pynndescent_index(std::string metric, std::string &vector_bin_path,
                         std::string &index_output_path, uint32_t max_deg, uint32_t num_clusters,
                        uint32_t cluster_size, double alpha, double delta)
{
    
    //instantiate build params object
    BuildParams BP(max_deg, alpha, num_clusters, cluster_size, delta);
    uint32_t graph_degree = BP.max_degree();

    //use file parsers to create Point object

    PointRange<Point> Points(vector_bin_path.data());
    //use max degree info to create Graph object
    Graph<unsigned int> G = Graph<unsigned int>(graph_degree, Points.size());

    //call the build function
    using index = pyNN_index<Point, PointRange<Point>, unsigned int>;
    index I(BP.R, BP.delta);
    stats<unsigned int> BuildStats(G.size());
    I.build_index(G, Points, BP.cluster_size, BP.num_clusters, BP.alpha);

    //save the graph object
    G.save(index_output_path.data());

    
}

// Explicit instantiations of build_pynndescent_index for each supported element type
// (float, int8_t, uint8_t) paired with the Euclidian and Mips point types.
template void build_pynndescent_index<float, Euclidian_Point<float>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                        uint32_t, double, double);                            
template void build_pynndescent_index<float, Mips_Point<float>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                        uint32_t, double, double);

template void build_pynndescent_index<int8_t, Euclidian_Point<int8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                         uint32_t, double, double);
template void build_pynndescent_index<int8_t, Mips_Point<int8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                         uint32_t, double, double);

template void build_pynndescent_index<uint8_t, Euclidian_Point<uint8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                          uint32_t, double, double);
template void build_pynndescent_index<uint8_t, Mips_Point<uint8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                          uint32_t, double, double);


template <typename T, typename Point>
void build_hnsw_index(std::string metric, std::string &vector_bin_path,
                         std::string &index_output_path, uint32_t graph_degree, uint32_t efc,
                        float m_l, float alpha)
{
    //instantiate build params object
    //BuildParams BP(graph_degree, efc, alpha);

    //use file parsers to create Point object
    PointRange<Point> Points(vector_bin_path.data());
    /*
    //use max degree info to create Graph object
    Graph<unsigned int> G = Graph<unsigned int>(graph_degree, Points.size());

    //call the build function
    using index = hnsw_index<Point, PointRange<T, Point>, unsigned int>;
    index I(BP);
    stats<unsigned int> BuildStats(G.size());
    I.build_index(G, Points, BuildStats);

    //save the graph object
    G.save(index_output_path.data());
    */
    using desc = Desc_HNSW<T, Point>;
    // using elem_t = typename desc::type_elem;

    // point_converter_default<elem_t> to_point;
    // auto [ps,dim] = load_point(vector_bin_path, to_point, cnt_points);
    auto ps = parlay::delayed_seq<Point>(
      Points.size(),
      [&](size_t i){return Points[i];}
    );
    const auto dim = Points.get_dims();
    auto G = ANN::HNSW<desc>(ps.begin(), ps.end(), dim, m_l, graph_degree, efc, alpha);
    G.save(index_output_path);
}

// Explicit instantiations of build_hnsw_index for every supported
// element type (float, int8, uint8) paired with Euclidean and MIPS points.
template void build_hnsw_index<float, Euclidian_Point<float>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                        float, float);
template void build_hnsw_index<float, Mips_Point<float>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                        float, float);

template void build_hnsw_index<int8_t, Euclidian_Point<int8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                         float, float);
template void build_hnsw_index<int8_t, Mips_Point<int8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                         float, float);

template void build_hnsw_index<uint8_t, Euclidian_Point<uint8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                          float, float);
template void build_hnsw_index<uint8_t, Mips_Point<uint8_t>>(std::string , std::string &, std::string &, uint32_t, uint32_t,
                                          float, float);


================================================
FILE: python/compile.sh
================================================
# Builds the ParlayANN Python extension module (_ParlayANNpy) from module.cpp
# as a shared object, using the pybind11 include paths and the extension
# suffix reported by the active `python` interpreter.
#
# Older command that built vamana_index.cpp on its own, kept for reference:
# g++ -DSTATS -DHOMEGROWN -pthread -mcx16 -O3 -Wall -shared -std=c++17 -march=native -DNDEBUG -I . -fPIC $(python3 -m pybind11 --includes) vamana_index.cpp -o vamana_index$(python3-config --extension-suffix) -DHOMEGROWN -pthread -ldl -L/usr/local/lib -ljemalloc 

# NOTE: -DHOMEGROWN and -pthread appear twice on the command line below, and
# linking against jemalloc is commented out at the end of the line.
g++ -DSTATS -DHOMEGROWN -pthread -mcx16 -O3 -shared -std=c++17 -march=native -DNDEBUG -I . -fPIC $(python -m pybind11 --includes) module.cpp -o _ParlayANNpy$(python -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))") -DHOMEGROWN -pthread -ldl -L/usr/local/lib #-ljemalloc 


================================================
FILE: python/defaults.py
================================================
# // This code is part of the Problem Based Benchmark Suite (PBBS)
# // Copyright (c) 2011 Guy Blelloch and the PBBS team
# //
# // Permission is hereby granted, free of charge, to any person obtaining a
# // copy of this software and associated documentation files (the
# // "Software"), to deal in the Software without restriction, including
# // without limitation the rights (to use, copy, modify, merge, publish,
# // distribute, sublicense, and/or sell copies of the Software, and to
# // permit persons to whom the Software is furnished to do so, subject to
# // the following conditions:
# //
# // The above copyright notice and this permission notice shall be included
# // in all copies or substantial portions of the Software.
# //
# // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""
# Parameter Defaults
These parameter defaults are re-exported from the C++ extension module, and used to keep the pythonic wrapper in sync with the C++.
"""
# The `defaults` submodule is created by the compiled extension; the names
# below simply alias its attributes so Python code can import them from here.
from ._ParlayANNpy import defaults as _defaults

# Default pruning parameter (described in the string that follows).
ALPHA = _defaults.ALPHA
""" 
Note that, as ALPHA is a `float32` (single precision float) in C++, when converted into Python it becomes a 
`float64` (double precision float). The actual value is 1.2f. The alpha parameter (>=1) is used to control the nature 
and number of points that are added to the graph. A higher alpha value (e.g., 1.4) will result in fewer hops (and IOs) 
to convergence, but probably more distance comparisons compared to a lower alpha value.
"""
# Default maximum out-degree of the graph (described in the string that follows).
GRAPH_DEGREE = _defaults.GRAPH_DEGREE
""" 
Graph degree (a.k.a. `R`) is the maximum degree allowed for a node in the index's graph structure. This degree will be 
pruned throughout the course of the index build, but it will never grow beyond this value. Higher R values require 
longer index build times, but may result in an index showing excellent recall and latency characteristics. 
"""
# Default candidate-list size (described in the string that follows).
BEAMWIDTH = _defaults.BEAMWIDTH
""" 
Complexity (a.k.a `L`) references the size of the list we store candidate approximate neighbors in while doing build
or search tasks. It's used during index build as part of the index optimization processes. It's used in index search 
classes both to help mitigate poor latencies during cold start, as well as on subsequent queries to conduct the search. 
Large values will likely increase latency but also may improve recall, and tuning these values for your particular 
index is certainly a reasonable choice.
"""

================================================
FILE: python/graph_index.cpp
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


#include "../algorithms/vamana/index.h"
#include "../algorithms/utils/types.h"
#include "../algorithms/utils/point_range.h"
#include "../algorithms/utils/graph.h"
#include "../algorithms/utils/euclidian_point.h"
#include "../algorithms/utils/mips_point.h"
#include "../algorithms/utils/jl_point.h"
#include "../algorithms/utils/stats.h"
#include "../algorithms/utils/beamSearch.h"
#include "../algorithms/HNSW/HNSW.hpp"
#include "pybind11/numpy.h"

#include "parlay/parallel.h"
#include "parlay/primitives.h"

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iomanip>
#include <iostream>
#include <limits>
#include <optional>
#include <set>
#include <string>
#include <utility>
#include <vector>

using namespace parlayANN;

namespace py = pybind11;
using NeighborsAndDistances = std::pair<py::array_t<unsigned int>, py::array_t<float>>;

template<typename T, typename Point>
struct GraphIndex{
  Graph<unsigned int> G;
  PointRange<Point> Points;

  // using EPoint = Euclidian_Point<uint16_t>;
  // using ERange = PointRange<EPoint>;
  // ERange E_Points;
  
  // euclidean quantized points
  using EQuantPoint = Euclidian_Point<uint8_t>;
  using EQuantRange = PointRange<EQuantPoint>;
  EQuantRange EQuant_Points;

  // euclidean low-quality quantized points
  using EQQuantPoint = Euclidean_JL_Sparse_Point<1024>;
  using EQQuantRange = PointRange<EQQuantPoint>;
  EQQuantRange EQQuant_Points;

  // mips or angular quantized points
  using MQuantT = int8_t;
  using MQuantPoint = Quantized_Mips_Point<8,true>;
  using MQuantRange = PointRange<MQuantPoint>;
  MQuantRange MQuant_Points;

  using MQQuantPoint = Mips_2Bit_Point;
  //using MQQuantPoint = Mips_JL_Sparse_Point<512>;
  using MQQuantRange = PointRange<MQQuantPoint>;
  MQQuantRange MQQuant_Points;
  
  bool use_quantization;

  std::optional<ANN::HNSW<Desc_HNSW<T, Point>>> HNSW_index;

  GraphIndex(std::string &data_path, std::string &index_path, bool is_hnsw=false)
    : use_quantization(false) {
    Points = PointRange<Point>(data_path.data());
    
    if (sizeof(T) > 1) {
      use_quantization = true;
      if (Point::is_metric()) {
        //E_Points = ERange(Points);
        EQuant_Points = EQuantRange(Points);
        if (Points.dimension() > 800)
          EQQuant_Points = EQQuantRange(Points);
      } else {
        for (int i=0; i < Points.size(); i++) 
          Points[i].normalize();
        MQuant_Points = MQuantRange(Points);
        // only double quantize for high dimensionality
        if (Points.dimension() > 200)
          MQQuant_Points = MQQuantRange(Points);
      }
    }

    if(is_hnsw) {
      HNSW_index = ANN::HNSW<Desc_HNSW<T, Point>>(
                                                  index_path,
                                                  [&](unsigned int i/*indexType*/){
                                                    return Points[i];
                                                  }
                                                  );
    }
    else {
      G = Graph<unsigned int>(index_path.data());
      if (G.size() != Points.size()) {
        std::cout << "graph size and point size do not match" << std::endl;
        abort();
      }
    }
  }

  auto search_dispatch(Point &q, QueryParams &QP, bool quant)
  {
    // if(HNSW_index) {
    //   using indexType = unsigned int; // be consistent with the type of G
    //   using std::pair;
    //   using seq_t = parlay::sequence<pair<indexType, typename Point::distanceType>>;

    //   indexType dist_cmps = 0;
    //   search_control ctrl{};
    //   if(QP.limit>0) {
    //     ctrl.limit_eval = QP.limit;
    //   }
    //   ctrl.count_cmps = &dist_cmps;

    //   seq_t frontier = HNSW_index->search(q, QP.k, QP.beamSize, ctrl);
    //   return pair(pair(std::move(frontier), seq_t{}), dist_cmps);
    // }
    //    else {
    using indexType = unsigned int;
    parlay::sequence<indexType> starts(1, 0);
    stats<indexType> Qstats(1);
    if (quant && use_quantization) {
      int dim = Points.params.dims;
      if (Point::is_metric()) {
        typename EQuantPoint::T buffer[dim];
        if (EQuant_Points.params.slope == 1) {
          for (int i=0; i < dim; i++)
            buffer[i] = q[i];
          EQuantPoint quant_q(buffer, 0, EQuant_Points.params);
          return beam_search(quant_q, G, EQuant_Points, starts, QP).first.first;
        } else {
          // uint8_t buffer_1[dim*2];
          // EPoint::translate_point(buffer_1, q, E_Points.params);
          // EPoint e_q(buffer_1, 0, E_Points.params);
          EQuantPoint::translate_point(buffer, q, EQuant_Points.params);
          EQuantPoint quant_q(buffer, 0, EQuant_Points.params);
          if (Points.dimension() > 800) {
            uint8_t buffer_2[dim];
            EQQuantPoint::translate_point(buffer_2, q, EQQuant_Points.params);
            EQQuantPoint quant_qq(buffer_2, 0, EQQuant_Points.params);
            return beam_search_rerank(q, quant_q, quant_qq, G,
                                      Points, EQuant_Points, EQQuant_Points,
                                      Qstats, starts, QP, false);
          } else // don't use second level quantization
            return beam_search_rerank(q, quant_q, quant_q, G,
                                      Points, EQuant_Points, EQuant_Points,
                                      Qstats, starts, QP, false);
        }
      } else {
        //typename MQuantPoint::T buffer[dim];
        uint8_t buffer[dim];
        q.normalize();
        MQuantPoint::translate_point(buffer, q, MQuant_Points.params);
        MQuantPoint quant_q(buffer, 0, MQuant_Points.params);
        if (Points.dimension() > 200) {
          uint8_t buffer_2[dim];
          MQQuantPoint::translate_point(buffer_2, q, MQQuant_Points.params);
          MQQuantPoint quant_qq(buffer_2, 0, MQQuant_Points.params);
          return beam_search_rerank(q, quant_q, quant_qq, G,
                                    Points, MQuant_Points, MQQuant_Points,
                                    Qstats, starts, QP, false);
        } else {
          return beam_search_rerank(q, quant_q, quant_q, G,
                                    Points, MQuant_Points, MQuant_Points,
                                    Qstats, starts, QP, false);
        }
      }
    } else {
      return beam_search(q, G, Points, starts, QP).first.first;
    }
  }

  NeighborsAndDistances batch_search(py::array_t<T, py::array::c_style | py::array::forcecast> &queries,
                                     //uint64_t num_queries_,
                                     uint64_t knn,
                                     uint64_t beam_width,
                                     bool quant = false,
                                     int64_t visit_limit = -1) {
    QueryParams QP(knn, beam_width, 1.35, visit_limit, std::min<int>(G.max_degree(), 3*visit_limit));

    uint64_t num_queries = queries.shape(0);
    py::array_t<unsigned int> ids({num_queries, knn});
    py::array_t<float> dists({num_queries, knn});

    parlay::parallel_for(0, num_queries, [&] (size_t i){
      std::vector<T> v(Points.dimension());
      for (int j=0; j < v.size(); j++)
        v[j] = queries.data(i)[j];
      Point q = Point((uint8_t*) v.data(), 0, Points.params);
      auto frontier = search_dispatch(q, QP, quant);
      for(int j=0; j<knn; j++){
        ids.mutable_data(i)[j] = frontier[j].first;
        dists.mutable_data(i)[j] = frontier[j].second;
      }
    });
    return std::make_pair(std::move(ids), std::move(dists));
  }

  py::array_t<unsigned int>
  single_search(py::array_t<T>& q, uint64_t knn,
                uint64_t beam_width, bool quant,
                int64_t visit_limit) {
    QueryParams QP(knn, beam_width, 1.35, visit_limit, std::min<int>(G.max_degree(), 3*visit_limit));
    int dims = Points.dimension();

    py::array_t<unsigned int> ids({(long) knn});
    auto pp = q.mutable_unchecked();
    T v[dims];
    for (int j=0; j < dims; j++)
      v[j] = pp(j); //q.data()[j];
    Point p = Point((uint8_t*) v, 0, Points.params);
    auto frontier = search_dispatch(p, QP, quant);
    for(int j=0; j<knn; j++) 
      ids.mutable_data()[j] = frontier[j].first;
    return std::move(ids);
  }

  NeighborsAndDistances batch_search_from_string(std::string &queries,
                                                 //uint64_t num_queries_,
                                                 uint64_t knn,
                                                 uint64_t beam_width, bool quant = false,
                                                 int64_t visit_limit = -1) {
    QueryParams QP(knn, beam_width, 1.35, visit_limit, std::min<int>(G.max_degree(), 3*visit_limit));
    PointRange<Point> QueryPoints(queries.data());
    uint64_t num_queries = QueryPoints.size();
    py::array_t<unsigned int> ids({num_queries, knn});
    py::array_t<float> dists({num_queries, knn});
    parlay::parallel_for(0, num_queries, [&] (size_t i){
      auto p = QueryPoints[i];
      auto frontier = search_dispatch(p, QP, quant);
      for(int j=0; j<knn; j++){
        ids.mutable_data(i)[j] = frontier[j].first;
        dists.mutable_data(i)[j] = frontier[j].second;
      }
    });

    return std::make_pair(std::move(ids), std::move(dists));
  }

  void check_recall(std::string &queries_file,
                    std::string &graph_file,
                    py::array_t<unsigned int, py::array::c_style | py::array::forcecast> &neighbors,
                    int k) {
    bool resolve_eq_distances = true;
    groundTruth<unsigned int> GT = groundTruth<unsigned int>(graph_file.data());
    PointRange<Point> QueryPoints(queries_file.data());

    size_t n = GT.size();
    long m = Points.size();

    float last_dist;
    
    int numCorrect = 0;
    for (unsigned int i = 0; i < n; i++) {
      parlay::sequence<int> results;
      int cnt = 0;
      for (unsigned int l = 0; l < k; l++)
        results.push_back(GT.coordinates(i,l));
      if (resolve_eq_distances) {
        last_dist = QueryPoints[i].distance(Points[GT.coordinates(i, k-1)]);
        for (unsigned int l = k; l < GT.dimension(); l++) {
          auto p = Points[GT.coordinates(i, l)];
          if (QueryPoints[i].distance(p) == last_dist) {
            cnt++;
            results.push_back(GT.coordinates(i,l));
          }
        }
      }
      std::set<int> reported_nbhs;
      for (unsigned int l = 0; l < k; l++) {
        long ngh = neighbors.mutable_data(i)[l];
        if (ngh < 0 || ngh >= m) {
          std::cout << "neighbor reported by query out of range: " << ngh << std::endl;
          std::abort();
        }
        reported_nbhs.insert(ngh);
      }
      for (unsigned int l = 0; l < results.size(); l++) {
        if (reported_nbhs.find(results[l]) != reported_nbhs.end()) {
          numCorrect += 1;
        }
      }
    }
    float recall = static_cast<double>(numCorrect) / static_cast<double>(k * n);
    std::cout << "Recall: " << std::setprecision(6) << recall <<std::endl;
  }

  // void check_recall(std::string &graph_file, py::array_t<unsigned int, py::array::c_style | py::array::forcecast> &neighbors, int k){
  //   groundTruth<unsigned int> GT = groundTruth<unsigned int>(graph_file.data());

  //   size_t n = GT.size();

  //   int numCorrect = 0;
  //   for (unsigned int i = 0; i < n; i++) {
  //     parlay::sequence<int> results_with_ties;
  //     for (unsigned int l = 0; l < k; l++)
  //       results_with_ties.push_back(GT.coordinates(i,l));
  //     std::cout << i << std::endl;
  //     float last_dist = GT.distances(i, k-1);
  //     for (unsigned int l = k; l < GT.dimension(); l++) {
  //       if (GT.distances(i,l) == last_dist) {
  //         results_with_ties.push_back(GT.coordinates(i,l));
  //       }
  //     }
  //     std::cout << "aa" << std::endl;
  //     std::set<int> reported_nbhs;
  //     for (unsigned int l = 0; l < k; l++) reported_nbhs.insert(neighbors.mutable_data(i)[l]);
  //     for (unsigned int l = 0; l < results_with_ties.size(); l++) {
  //       if (reported_nbhs.find(results_with_ties[l]) != reported_nbhs.end()) {
  //         numCorrect += 1;
  //       }
  //     }
  //   }
  //   float recall = static_cast<float>(numCorrect) / static_cast<float>(k * n);
  //   std::cout << "Recall: " << recall << std::endl;
  // }

};


================================================
FILE: python/module.cpp
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <string>

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "pybind11/numpy.h"

#include "builder.cpp"
#include "graph_index.cpp"

using namespace parlayANN;

PYBIND11_MAKE_OPAQUE(std::vector<uint32_t>);
PYBIND11_MAKE_OPAQUE(std::vector<float>);
PYBIND11_MAKE_OPAQUE(std::vector<int8_t>);
PYBIND11_MAKE_OPAQUE(std::vector<uint8_t>);

namespace py = pybind11;
using namespace pybind11::literals;

// using NeighborsAndDistances = std::pair<py::array_t<unsigned int, py::array::c_style | py::array::forcecast>, py::array_t<float, py::array::c_style | py::array::forcecast>>;

// Names under which one (element type, point type) combination is exposed
// to Python. Instances are created with aggregate initialisation, so the
// field order (builder first, index second) matters.
struct Variant
{
    std::string builder_name; // name of the module-level build function
    std::string index_name;   // name of the bound index class
};

// Vamana variants: both the builder function and the index class are
// registered under these names by add_variant.
const Variant FloatEuclidianVariant{"build_vamana_float_euclidian_index", "FloatEuclidianIndex"};
const Variant FloatMipsVariant{"build_vamana_float_mips_index", "FloatMipsIndex"};

const Variant UInt8EuclidianVariant{"build_vamana_uint8_euclidian_index", "UInt8EuclidianIndex"};
const Variant UInt8MipsVariant{"build_vamana_uint8_mips_index", "UInt8MipsIndex"};

const Variant Int8EuclidianVariant{"build_vamana_int8_euclidian_index", "Int8EuclidianIndex"};
const Variant Int8MipsVariant{"build_vamana_int8_mips_index", "Int8MipsIndex"};

// Registers the Vamana builder function and the GraphIndex class for one
// (element type, point type) combination.
//
//   m       - module receiving the bindings.
//   variant - Python names for the builder function and the index class.
template <typename T, typename Point> inline void add_variant(py::module_ &m, const Variant &variant)
{

    m.def(variant.builder_name.c_str(), build_vamana_index<T, Point>, "distance_metric"_a,
          "data_file_path"_a, "index_output_path"_a, "graph_degree"_a, "beam_width"_a, "alpha"_a, "two_pass"_a);

    py::class_<GraphIndex<T, Point>>(m, variant.index_name.c_str())
      // Keyword names follow the C++ constructor, which takes the data path
      // first and the index path second. They were previously bound the
      // other way round, so keyword callers had the two files swapped.
      .def(py::init<std::string &, std::string &, bool>(),
           "data_path"_a, "index_path"_a, "hnsw"_a=false)
      //do we want to add options like visited limit, or leave those as defaults?
      .def("batch_search", &GraphIndex<T, Point>::batch_search, "queries"_a, "knn"_a,
           "beam_width"_a, "quant"_a, "visit_limit"_a)
      .def("single_search", &GraphIndex<T, Point>::single_search, "q"_a, "knn"_a,
           "beam_width"_a, "quant"_a, "visit_limit"_a)
      .def("batch_search_from_string", &GraphIndex<T, Point>::batch_search_from_string, "queries"_a, "knn"_a,
           "beam_width"_a, "quant"_a, "visit_limit"_a)
      .def("check_recall", &GraphIndex<T, Point>::check_recall, "queries_file"_a, "graph_file"_a, "neighbors"_a, "k"_a);
}

// HCNNG variants. add_hcnng_variant only reads builder_name; the index_name
// entries repeat the class names already used by the Vamana variants.
const Variant FloatEuclidianHCNNGVariant{"build_hcnng_float_euclidian_index", "FloatEuclidianIndex"};
const Variant FloatMipsHCNNGVariant{"build_hcnng_float_mips_index", "FloatMipsIndex"};

const Variant UInt8EuclidianHCNNGVariant{"build_hcnng_uint8_euclidian_index", "UInt8EuclidianIndex"};
const Variant UInt8MipsHCNNGVariant{"build_hcnng_uint8_mips_index", "UInt8MipsIndex"};

const Variant Int8EuclidianHCNNGVariant{"build_hcnng_int8_euclidian_index", "Int8EuclidianIndex"};
const Variant Int8MipsHCNNGVariant{"build_hcnng_int8_mips_index", "Int8MipsIndex"};

// Registers the HCNNG builder function for one (element type, point type)
// combination. Only variant.builder_name is used; no class is bound here.
template <typename T, typename Point> inline void add_hcnng_variant(py::module_ &m, const Variant &variant)
{
    const char *py_name = variant.builder_name.c_str();

    m.def(py_name,
          build_hcnng_index<T, Point>,
          "distance_metric"_a,
          "data_file_path"_a,
          "index_output_path"_a,
          "mst_deg"_a,
          "num_clusters"_a,
          "cluster_size"_a);
}

// pyNNDescent variants. add_pynndescent_variant only reads builder_name; the
// index_name entries repeat the class names already used by the Vamana variants.
const Variant FloatEuclidianpyNNVariant{"build_pynndescent_float_euclidian_index", "FloatEuclidianIndex"};
const Variant FloatMipspyNNVariant{"build_pynndescent_float_mips_index", "FloatMipsIndex"};

const Variant UInt8EuclidianpyNNVariant{"build_pynndescent_uint8_euclidian_index", "UInt8EuclidianIndex"};
const Variant UInt8MipspyNNVariant{"build_pynndescent_uint8_mips_index", "UInt8MipsIndex"};

const Variant Int8EuclidianpyNNVariant{"build_pynndescent_int8_euclidian_index", "Int8EuclidianIndex"};
const Variant Int8MipspyNNVariant{"build_pynndescent_int8_mips_index", "Int8MipsIndex"};

// Registers the pyNNDescent builder function for one (element type, point
// type) combination. Only variant.builder_name is used; no class is bound here.
template <typename T, typename Point> inline void add_pynndescent_variant(py::module_ &m, const Variant &variant)
{
    const char *py_name = variant.builder_name.c_str();

    m.def(py_name,
          build_pynndescent_index<T, Point>,
          "distance_metric"_a,
          "data_file_path"_a,
          "index_output_path"_a,
          "max_deg"_a,
          "num_clusters"_a,
          "cluster_size"_a,
          "alpha"_a,
          "delta"_a);
}

// HNSW variants. add_hnsw_variant only reads builder_name; the index_name
// entries repeat the class names already used by the Vamana variants.
const Variant FloatEuclidianHNSWVariant{"build_hnsw_float_euclidian_index", "FloatEuclidianIndex"};
const Variant FloatMipsHNSWVariant{"build_hnsw_float_mips_index", "FloatMipsIndex"};

const Variant UInt8EuclidianHNSWVariant{"build_hnsw_uint8_euclidian_index", "UInt8EuclidianIndex"};
const Variant UInt8MipsHNSWVariant{"build_hnsw_uint8_mips_index", "UInt8MipsIndex"};

const Variant Int8EuclidianHNSWVariant{"build_hnsw_int8_euclidian_index", "Int8EuclidianIndex"};
const Variant Int8MipsHNSWVariant{"build_hnsw_int8_mips_index", "Int8MipsIndex"};

// Registers the HNSW builder function for one (element type, point type)
// combination. Only variant.builder_name is used; no class is bound here.
template <typename T, typename Point> inline void add_hnsw_variant(py::module_ &m, const Variant &variant)
{
    const char *py_name = variant.builder_name.c_str();

    m.def(py_name,
          build_hnsw_index<T, Point>,
          "distance_metric"_a,
          "data_file_path"_a,
          "index_output_path"_a,
          "graph_degree"_a,
          "efc"_a,
          "m_l"_a,
          "alpha"_a);
}


// Module entry point: sets the module metadata, publishes the `defaults`
// submodule and registers every builder / index variant.
PYBIND11_MODULE(_ParlayANNpy, m)
{
    // Point types for each supported element type.
    using FloatL2 = Euclidian_Point<float>;
    using FloatIP = Mips_Point<float>;
    using UInt8L2 = Euclidian_Point<uint8_t>;
    using UInt8IP = Mips_Point<uint8_t>;
    using Int8L2 = Euclidian_Point<int8_t>;
    using Int8IP = Mips_Point<int8_t>;

    m.doc() = "ParlayANN Python Bindings";
#ifdef VERSION_INFO
    m.attr("__version__") = VERSION_INFO;
#else
    m.attr("__version__") = "dev";
#endif

    // Default parameter values, re-exported on the Python side.
    py::module_ defaults_mod = m.def_submodule("defaults");
    defaults_mod.attr("METRIC") = "Euclidian";
    defaults_mod.attr("ALPHA") = 1.2;
    defaults_mod.attr("GRAPH_DEGREE") = 64;
    defaults_mod.attr("BEAMWIDTH") = 128;

    // Vamana: builder functions plus the index classes.
    add_variant<float, FloatL2>(m, FloatEuclidianVariant);
    add_variant<float, FloatIP>(m, FloatMipsVariant);
    add_variant<uint8_t, UInt8L2>(m, UInt8EuclidianVariant);
    add_variant<uint8_t, UInt8IP>(m, UInt8MipsVariant);
    add_variant<int8_t, Int8L2>(m, Int8EuclidianVariant);
    add_variant<int8_t, Int8IP>(m, Int8MipsVariant);

    // HCNNG: builder functions only.
    add_hcnng_variant<float, FloatL2>(m, FloatEuclidianHCNNGVariant);
    add_hcnng_variant<float, FloatIP>(m, FloatMipsHCNNGVariant);
    add_hcnng_variant<uint8_t, UInt8L2>(m, UInt8EuclidianHCNNGVariant);
    add_hcnng_variant<uint8_t, UInt8IP>(m, UInt8MipsHCNNGVariant);
    add_hcnng_variant<int8_t, Int8L2>(m, Int8EuclidianHCNNGVariant);
    add_hcnng_variant<int8_t, Int8IP>(m, Int8MipsHCNNGVariant);

    // pyNNDescent: builder functions only.
    add_pynndescent_variant<float, FloatL2>(m, FloatEuclidianpyNNVariant);
    add_pynndescent_variant<float, FloatIP>(m, FloatMipspyNNVariant);
    add_pynndescent_variant<uint8_t, UInt8L2>(m, UInt8EuclidianpyNNVariant);
    add_pynndescent_variant<uint8_t, UInt8IP>(m, UInt8MipspyNNVariant);
    add_pynndescent_variant<int8_t, Int8L2>(m, Int8EuclidianpyNNVariant);
    add_pynndescent_variant<int8_t, Int8IP>(m, Int8MipspyNNVariant);

    // HNSW: builder functions only.
    add_hnsw_variant<float, FloatL2>(m, FloatEuclidianHNSWVariant);
    add_hnsw_variant<float, FloatIP>(m, FloatMipsHNSWVariant);
    add_hnsw_variant<uint8_t, UInt8L2>(m, UInt8EuclidianHNSWVariant);
    add_hnsw_variant<uint8_t, UInt8IP>(m, UInt8MipsHNSWVariant);
    add_hnsw_variant<int8_t, Int8L2>(m, Int8EuclidianHNSWVariant);
    add_hnsw_variant<int8_t, Int8IP>(m, Int8MipsHNSWVariant);
}


================================================
FILE: python/scripts/fashion_test.py
================================================
"""Build a Vamana index for Fashion-MNIST, time batched queries, and print recall."""
import _ParlayANNpy as pann
import wrapper as wp
import time

NAME = "fashion-mnist-784-euclidean"
DATA_DIR = "data/" + NAME + "/"
metric = "Euclidian"

# Files used by this run.
base_file = DATA_DIR + NAME + "_base.fbin"
query_file = DATA_DIR + NAME + "_query.fbin"
truth_file = DATA_DIR + NAME + "_groundtruth"
index_file = DATA_DIR + "outputs/" + NAME

wp.build_vamana_index(metric, "float", base_file, index_file, 32, 64, 1.15, True)

Index = wp.load_index(metric, "float", base_file, index_file)

# Each entry is (beam width, visit limit).
for beam_width, visit_limit in [(10, 10), (10, 13), (10, 16)]:
    # Five timed repetitions per setting.
    for _ in range(5):
        start = time.time()
        neighbors, distances = Index.batch_search_from_string(query_file, 10, beam_width, True, visit_limit)
        end = time.time()
        print("QPS: ", neighbors.shape[0]/(end - start))

    # Recall is computed from the last repetition's results.
    Index.check_recall(query_file, truth_file, neighbors, 10)


================================================
FILE: python/scripts/gist_test.py
================================================
"""Load a prebuilt Vamana index for GIST, time batched queries, and print recall."""
import _ParlayANNpy as pann
import wrapper as wp
import time

NAME = "gist"
DATA_DIR = "data/" + NAME + "/"
metric = "Euclidian"

# Files used by this run.
base_file = DATA_DIR + NAME + "_base.fbin"
query_file = DATA_DIR + NAME + "_query.fbin"
truth_file = DATA_DIR + NAME + "-1M"
index_file = DATA_DIR + "outputs/" + NAME

# wp.build_vamana_index(metric, "float", DATA_DIR + NAME + "_base.fbin", DATA_DIR + "outputs/" + NAME, 100, 200, 1.1, True)

Index = wp.load_index(metric, "float", base_file, index_file)

for beam_width in [19, 36, 65]:
    # Five timed repetitions per beam width.
    for _ in range(5):
        start = time.time()
        neighbors, distances = Index.batch_search_from_string(query_file, 10, beam_width, True, 1000)
        end = time.time()
        print("QPS: ", neighbors.shape[0]/(end - start))

    # Recall is computed from the last repetition's results.
    Index.check_recall(query_file, truth_file, neighbors, 10)


================================================
FILE: python/scripts/glove100_test.py
================================================
"""Load a prebuilt Vamana index for GloVe-100, time batched queries, and print recall."""
import _ParlayANNpy as pann
import wrapper as wp
import time

NAME = "glove-100-angular"
DATA_DIR = "data/" + NAME + "/"
metric = "mips"

# Files used by this run.
base_file = DATA_DIR + NAME + "_base.fbin"
query_file = DATA_DIR + NAME + "_query.fbin"
truth_file = DATA_DIR + NAME + "_groundtruth"
index_file = DATA_DIR + "outputs/" + NAME

# wp.build_vamana_index(metric, "float", DATA_DIR + NAME + "_base.fbin", DATA_DIR + "outputs/" + NAME, 150, 300, 1, True)

Index = wp.load_index(metric, "float", base_file, index_file)

for beam_width in [30, 80, 160]:
    # Five timed repetitions per beam width.
    for _ in range(5):
        start = time.time()
        neighbors, distances = Index.batch_search_from_string(query_file, 10, beam_width, True, 1000)
        end = time.time()
        print("QPS: ", neighbors.shape[0]/(end - start))

    # Recall is computed from the last repetition's results.
    Index.check_recall(query_file, truth_file, neighbors, 10)


================================================
FILE: python/scripts/glove25_test.py
================================================
"""Load a prebuilt Vamana index for GloVe-25, time batched queries, and print recall."""
import _ParlayANNpy as pann
import wrapper as wp
import time

NAME = "glove-25-angular"
DATA_DIR = "data/" + NAME + "/"
metric = "mips"

# Files used by this run.
base_file = DATA_DIR + NAME + "_base.fbin"
query_file = DATA_DIR + NAME + "_query.fbin"
truth_file = DATA_DIR + NAME + "_groundtruth"
index_file = DATA_DIR + "outputs/" + NAME

# wp.build_vamana_index(metric, "float", DATA_DIR + NAME + "_base.fbin", DATA_DIR + "outputs/" + NAME, 150, 300, 1, True)

Index = wp.load_index(metric, "float", base_file, index_file)

for beam_width in [11, 20, 32]:
    # Five timed repetitions per beam width.
    for _ in range(5):
        start = time.time()
        neighbors, distances = Index.batch_search_from_string(query_file, 10, beam_width, True, 1000)
        end = time.time()
        print("QPS: ", neighbors.shape[0]/(end - start))

    # Recall is computed from the last repetition's results.
    Index.check_recall(query_file, truth_file, neighbors, 10)


================================================
FILE: python/scripts/nyt_test.py
================================================
"""Load a prebuilt Vamana index for NYTimes-256, time batched queries, and print recall."""
import _ParlayANNpy as pann
import wrapper as wp
import time

NAME = "nytimes-256-angular"
DATA_DIR = "data/" + NAME + "/"
metric = "mips"

# Files used by this run.
base_file = DATA_DIR + NAME + "_base.fbin"
query_file = DATA_DIR + NAME + "_query.fbin"
truth_file = DATA_DIR + NAME + "_groundtruth"
index_file = DATA_DIR + "outputs/" + NAME

# wp.build_vamana_index(metric, "float", DATA_DIR + NAME + "_base.fbin", DATA_DIR + "outputs/" + NAME, 150, 300, .85, True)

Index = wp.load_index(metric, "float", base_file, index_file)

for beam_width in [17, 60]:
    # Five timed repetitions per beam width.
    for _ in range(5):
        start = time.time()
        neighbors, distances = Index.batch_search_from_string(query_file, 10, beam_width, True, 1000)
        end = time.time()
        print("QPS: ", neighbors.shape[0]/(end - start))

    # Recall is computed from the last repetition's results.
    Index.check_recall(query_file, truth_file, neighbors, 10)


================================================
FILE: python/scripts/sift_test.py
================================================
import _ParlayANNpy as pann
import wrapper as wp
import time

NAME = "sift-128-euclidean"
DATA_DIR = "data/" + NAME + "/"
metric = "Euclidian"

# Uncomment one of these to (re)build a graph; the output-name suffix matches
# the first numeric argument passed to the builder.
# wp.build_vamana_index(metric, "float", DATA_DIR + NAME + "_base.fbin", DATA_DIR + "outputs/" + NAME + "_64", 64, 128, 1.15, True)
# wp.build_vamana_index(metric, "float", DATA_DIR + NAME + "_base.fbin", DATA_DIR + "outputs/" + NAME + "_40", 40, 80, 1.15, True)
# wp.build_vamana_index(metric, "float", DATA_DIR + NAME + "_base.fbin", DATA_DIR + "outputs/" + NAME + "_32", 32, 64, 1.15, True)

Index = wp.load_index(metric, "float", DATA_DIR + NAME + "_base.fbin", DATA_DIR + "outputs/" + NAME + "_64")

# For each value of Q, time five identical batch searches and print the
# throughput of each, then check recall using the results of the last run.
for Q in [14, 23] :
    for x in range(5) :
        start = time.time()
        neighbors, distances = Index.batch_search_from_string(DATA_DIR + NAME + "_query.fbin", 10, Q, True, 1000)
        end = time.time()
        print(neighbors.size)
        # The f-string previously lacked its closing quote, which made the
        # whole script a SyntaxError.
        print("QPS: ", f'{neighbors.shape[0]/(end - start):.6f}')
        
    Index.check_recall(DATA_DIR + NAME + "_query.fbin", DATA_DIR + NAME + "_groundtruth", neighbors, 10)


================================================
FILE: python/sift_test.py
================================================
import _ParlayANNpy as pann
import wrapper as wp
import time

NAME = "sift-128-euclidean"
DATA_DIR = "data/" + NAME + "/"
metric = "Euclidian"

# File locations used below.
BASE_FILE = DATA_DIR + "base.fbin"
QUERY_FILE = DATA_DIR + "query.fbin"
GROUNDTRUTH_FILE = DATA_DIR + "groundtruth"
GRAPH_FILE = DATA_DIR + "graphs/graph_" + "64_1.1x"

# Uncomment to (re)build the graph before loading it:
# wp.build_vamana_index(metric, "float", BASE_FILE, GRAPH_FILE, 64, 128, 1.1, True)

Index = wp.load_index(metric, "float", BASE_FILE, GRAPH_FILE)

# For each value of Q, time three identical batch searches and print the
# throughput of each, then check recall using the results of the last run.
for Q in (16, 25):
    for _ in range(3):
        start = time.time()
        neighbors, distances = Index.batch_search_from_string(QUERY_FILE, 10, Q, True, 1000)
        elapsed = time.time() - start
        print("QPS: ", neighbors.shape[0] / elapsed)

    Index.check_recall(QUERY_FILE, GROUNDTRUTH_FILE, neighbors, 10)


================================================
FILE: python/test.py
================================================
import wrapper as wp


FERN_DATA_DIR = "/ssd1/anndata/bigann/"
AWARE_DATA_DIR = "/ssd1/data/bigann/"

# Select which machine's data directory to use.
DATA_DIR = FERN_DATA_DIR

# Files shared by every algorithm exercised below.
BASE_FILE = DATA_DIR + "base.1B.u8bin.crop_nb_1000000"
QUERY_FILE = DATA_DIR + "query.public.10K.u8bin"
GROUNDTRUTH_FILE = DATA_DIR + "bigann-1M"

# Each section builds a graph, loads it back, runs one batch search over the
# query file and checks recall against the ground truth.

print("Testing pynndescent...")

PYNN_GRAPH = DATA_DIR + "outputs/pynn"
wp.build_pynndescent_index("Euclidian", "uint8", BASE_FILE, PYNN_GRAPH, 40, 10, 100, 1.2, .05)

Index = wp.load_index("Euclidian", "uint8", BASE_FILE, PYNN_GRAPH)
neighbors, distances = Index.batch_search_from_string(QUERY_FILE, 10, 10, True, 10000)

Index.check_recall(QUERY_FILE, GROUNDTRUTH_FILE, neighbors, 10)

print("\nTesting vamana...")

VAMANA_GRAPH = DATA_DIR + "outputs/vamana"
wp.build_vamana_index("Euclidian", "uint8", BASE_FILE, VAMANA_GRAPH, 40, 100, 1.2, False)

Index = wp.load_index("Euclidian", "uint8", BASE_FILE, VAMANA_GRAPH)
neighbors, distances = Index.batch_search_from_string(QUERY_FILE, 10, 10, True, 10000)

Index.check_recall(QUERY_FILE, GROUNDTRUTH_FILE, neighbors, 10)

print("\nTesting hcnng...")

HCNNG_GRAPH = DATA_DIR + "outputs/hcnng"
wp.build_hcnng_index("Euclidian", "uint8", BASE_FILE, HCNNG_GRAPH, 40, 20, 1000)

Index = wp.load_index("Euclidian", "uint8", BASE_FILE, HCNNG_GRAPH)
neighbors, distances = Index.batch_search_from_string(QUERY_FILE, 10, 10, True, 10000)

Index.check_recall(QUERY_FILE, GROUNDTRUTH_FILE, neighbors, 10)

# HNSW is not exercised here: it is currently broken, as far as I can tell

================================================
FILE: python/wrapper.py
================================================
from _ParlayANNpy import *

def build_vamana_index(metric, dtype, data_dir, index_dir, R, L, alpha, two_pass):
    """Call the Vamana builder that matches ``metric`` and ``dtype``.

    ``metric`` must be 'Euclidian' or 'mips' and ``dtype`` one of 'uint8',
    'int8' or 'float'; anything else raises ``Exception``. The remaining
    arguments are forwarded unchanged, and the builder's result is returned.
    """
    # Validate up front; the metric is checked before the data type.
    if metric not in ('Euclidian', 'mips'):
        raise Exception('Invalid metric ' + metric)
    if dtype not in ('uint8', 'int8', 'float'):
        raise Exception('Invalid data type ' + dtype)

    build_args = (metric, data_dir, index_dir, R, L, alpha, two_pass)
    if metric == 'Euclidian':
        if dtype == 'uint8':
            return build_vamana_uint8_euclidian_index(*build_args)
        if dtype == 'int8':
            return build_vamana_int8_euclidian_index(*build_args)
        return build_vamana_float_euclidian_index(*build_args)

    # metric == 'mips'
    if dtype == 'uint8':
        return build_vamana_uint8_mips_index(*build_args)
    if dtype == 'int8':
        return build_vamana_int8_mips_index(*build_args)
    return build_vamana_float_mips_index(*build_args)
    

def build_hcnng_index(metric, dtype, data_dir, index_dir, mst_deg, num_clusters, cluster_size):
    """Call the HCNNG builder that matches ``metric`` and ``dtype``.

    ``metric`` must be 'Euclidian' or 'mips' and ``dtype`` one of 'uint8',
    'int8' or 'float'; anything else raises ``Exception``. The remaining
    arguments are forwarded unchanged. Nothing is returned.
    """
    # Validate up front; the metric is checked before the data type.
    if metric not in ('Euclidian', 'mips'):
        raise Exception('Invalid metric ' + metric)
    if dtype not in ('uint8', 'int8', 'float'):
        raise Exception('Invalid data type ' + dtype)

    build_args = (metric, data_dir, index_dir, mst_deg, num_clusters, cluster_size)
    if metric == 'Euclidian':
        if dtype == 'uint8':
            build_hcnng_uint8_euclidian_index(*build_args)
        elif dtype == 'int8':
            build_hcnng_int8_euclidian_index(*build_args)
        else:
            build_hcnng_float_euclidian_index(*build_args)
    else:  # metric == 'mips'
        if dtype == 'uint8':
            build_hcnng_uint8_mips_index(*build_args)
        elif dtype == 'int8':
            build_hcnng_int8_mips_index(*build_args)
        else:
            build_hcnng_float_mips_index(*build_args)


def build_pynndescent_index(metric, dtype, data_dir, index_dir, max_deg, num_clusters, cluster_size, alpha, delta):
    """Call the pyNNDescent builder that matches ``metric`` and ``dtype``.

    ``metric`` must be 'Euclidian' or 'mips' and ``dtype`` one of 'uint8',
    'int8' or 'float'; anything else raises ``Exception``. The remaining
    arguments are forwarded unchanged. Nothing is returned.
    """
    # Validate up front; the metric is checked before the data type.
    if metric not in ('Euclidian', 'mips'):
        raise Exception('Invalid metric ' + metric)
    if dtype not in ('uint8', 'int8', 'float'):
        raise Exception('Invalid data type ' + dtype)

    build_args = (metric, data_dir, index_dir, max_deg, num_clusters, cluster_size, alpha, delta)
    if metric == 'Euclidian':
        if dtype == 'uint8':
            build_pynndescent_uint8_euclidian_index(*build_args)
        elif dtype == 'int8':
            build_pynndescent_int8_euclidian_index(*build_args)
        else:
            build_pynndescent_float_euclidian_index(*build_args)
    else:  # metric == 'mips'
        if dtype == 'uint8':
            build_pynndescent_uint8_mips_index(*build_args)
        elif dtype == 'int8':
            build_pynndescent_int8_mips_index(*build_args)
        else:
            build_pynndescent_float_mips_index(*build_args)


def build_hnsw_index(metric, dtype, data_dir, index_dir, R, efc, m_l, alpha):
    """Call the HNSW builder that matches ``metric`` and ``dtype``.

    ``metric`` must be 'Euclidian' or 'mips' and ``dtype`` one of 'uint8',
    'int8' or 'float'; anything else raises ``Exception``. The remaining
    arguments are forwarded unchanged. Nothing is returned.
    """
    # Validate up front; the metric is checked before the data type.
    if metric not in ('Euclidian', 'mips'):
        raise Exception('Invalid metric ' + metric)
    if dtype not in ('uint8', 'int8', 'float'):
        raise Exception('Invalid data type ' + dtype)

    build_args = (metric, data_dir, index_dir, R, efc, m_l, alpha)
    if metric == 'Euclidian':
        if dtype == 'uint8':
            build_hnsw_uint8_euclidian_index(*build_args)
        elif dtype == 'int8':
            build_hnsw_int8_euclidian_index(*build_args)
        else:
            build_hnsw_float_euclidian_index(*build_args)
    else:  # metric == 'mips'
        if dtype == 'uint8':
            build_hnsw_uint8_mips_index(*build_args)
        elif dtype == 'int8':
            build_hnsw_int8_mips_index(*build_args)
        else:
            build_hnsw_float_mips_index(*build_args)

        
def load_index(metric, dtype, data_dir, index_dir, hnsw=False):
    """Construct and return the index object for ``metric`` and ``dtype``.

    ``metric`` must be 'Euclidian' or 'mips' and ``dtype`` one of 'uint8',
    'int8' or 'float'; anything else raises ``Exception``. ``data_dir``,
    ``index_dir`` and ``hnsw`` are passed to the index constructor as given.
    """
    # Validate up front; the metric is checked before the data type.
    if metric not in ('Euclidian', 'mips'):
        raise Exception('Invalid metric')
    if dtype not in ('uint8', 'int8', 'float'):
        raise Exception('Invalid data type')

    if metric == 'Euclidian':
        if dtype == 'uint8':
            return UInt8EuclidianIndex(data_dir, index_dir, hnsw)
        if dtype == 'int8':
            return Int8EuclidianIndex(data_dir, index_dir, hnsw)
        return FloatEuclidianIndex(data_dir, index_dir, hnsw)

    # metric == 'mips'
    if dtype == 'uint8':
        return UInt8MipsIndex(data_dir, index_dir, hnsw)
    if dtype == 'int8':
        return Int8MipsIndex(data_dir, index_dir, hnsw)
    return FloatMipsIndex(data_dir, index_dir, hnsw)


================================================
FILE: rangeSearch/bench/.gitignore
================================================
neighborsCheck
neighborsCheck.o


================================================
FILE: rangeSearch/bench/IO.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#pragma once

#include <iostream>
#include <fstream>
#include <string>
#include <string>
#include <cstring>
#include "parlay/primitives.h"
#include "parlay/parallel.h"
#include "parlay/io.h"
#include "parlay/internal/get_time.h"

namespace benchIO {
  using namespace std;
  using parlay::sequence;
  using parlay::tabulate;
  using parlay::make_slice;

  // Predicate used to split text into tokens: true for carriage return,
  // tab, newline, the NUL character and space; false for everything else.
  auto is_space = [] (char c) {
    return c == '\r' || c == '\t' || c == '\n' || c == 0 || c == ' ';
  };

  // Splits Str into words in parallel and returns a pointer to the first
  // character of every word.
  // Side effect: every character of Str for which is_space holds is
  // overwritten with 0, so the words end up separated by null characters.
  template <class Seq>
  parlay::sequence<char*> stringToWords(Seq &Str) {
    const size_t len = Str.size();

    // turn every separator into a null character
    parlay::parallel_for(0, len, [&] (long pos) {
      if (is_space(Str[pos])) Str[pos] = 0;
    });

    // a word starts at a non-null character that is either the very first
    // character or directly preceded by a null character
    auto isWordStart = parlay::tabulate(len, [&] (long pos) -> bool {
      if (pos == 0) return Str[0];
      return Str[pos] && !Str[pos-1];
    });

    // positions in Str at which a word starts
    auto wordStarts = parlay::pack_index<long>(isWordStart);

    // translate each start position into a pointer into Str
    return parlay::tabulate(wordStarts.size(), [&] (long w) -> char* {
      return Str.begin() + wordStarts[w];
    });
  }

  //using this as a typename so we can replace with parlay::chars easily if desired
  using charstring = typename parlay::sequence<char>;

  inline int xToStringLen(charstring const &a) { return a.size();}
  inline void xToString(char* s, charstring const &a) {
    for (int i=0; i < a.size(); i++) s[i] = a[i];}

  // For every supported scalar type, xToStringLen(x) is an upper bound on the
  // number of bytes (terminating NUL included) that xToString(s, x) writes
  // into s. All writes are bounded by that length.

  inline int xToStringLen(long a) { return 21;}
  inline void xToString(char* s, long a) { snprintf(s, xToStringLen(a), "%ld", a);}

  inline int xToStringLen(unsigned long a) { return 21;}
  inline void xToString(char* s, unsigned long a) { snprintf(s, xToStringLen(a), "%lu", a);}

  inline uint xToStringLen(uint a) { return 12;}
  inline void xToString(char* s, uint a) { snprintf(s, xToStringLen(a), "%u", a);}

  inline int xToStringLen(int a) { return 12;}
  inline void xToString(char* s, int a) { snprintf(s, xToStringLen(a), "%d", a);}

  // "%.11le" can produce up to 19 characters, e.g. "-1.23456789012e-100"
  // (sign, 1 digit, '.', 11 digits, 'e', sign, 3 exponent digits), so 20
  // bytes are needed with the NUL. The previous value of 18 was too small.
  inline int xToStringLen(double a) { return 20;}
  inline void xToString(char* s, double a) { snprintf(s, xToStringLen(a), "%.11le", a);}

  inline int xToStringLen(char* a) { return strlen(a)+1;}
  inline void xToString(char* s, char* a) { snprintf(s, strlen(a)+1, "%s", a);}

  // Space needed for a pair: both components plus the separating blank.
  template <class A, class B>
  inline int xToStringLen(pair<A,B> a) { 
    const int firstLen = xToStringLen(a.first);
    const int secondLen = xToStringLen(a.second);
    return firstLen + secondLen + 1;
  }

  // Writes a.first at s, a blank at offset xToStringLen(a.first), and
  // a.second immediately after that blank.
  template <class A, class B>
  inline void xToString(char* s, pair<A,B> a) { 
    char* separator = s + xToStringLen(a.first);
    xToString(s, a.first);
    *separator = ' ';
    xToString(separator + 1, a.second);
  }

  // Converts a sequence into a character string with one element per line.
  // Every element is formatted with xToString into its own slot of
  // xToStringLen(x)+1 bytes in a zero-filled buffer; the zero bytes are then
  // filtered out. The result ends with a newline followed by a single 0
  // character, which callers skip by writing size()-1 bytes.
  // An empty input yields just that trailing 0 character; previously it
  // underflowed n-1 and read A[n-1].
  template <class Seq>
  charstring seqToString(Seq const &A) {
    size_t n = A.size();
    if (n == 0) return charstring(1, (char) 0);

    // slot length of every element; after the scan L holds the slot offsets
    // and m the total buffer length
    auto L = parlay::tabulate(n, [&] (size_t i) -> long {
	typename Seq::value_type x = A[i];
	return xToStringLen(x)+1;});
    size_t m;
    std::tie(L,m) = parlay::scan(std::move(L));

    charstring B(m+1, (char) 0);
    char* Bs = B.begin();

    // all but the last element: format it and end its slot with a newline
    parlay::parallel_for(0, n-1, [&] (long i) {
      xToString(Bs + L[i], A[i]);
      Bs[L[i+1] - 1] = '\n';
      });
    // the last element is followed by two newlines; the second one is
    // replaced by the terminating 0 below
    xToString(Bs + L[n-1], A[n-1]);
    Bs[m] = Bs[m-1] = '\n';
    
    charstring C = parlay::filter(B, [&] (char c) {return c != 0;}); 
    C[C.size()-1] = 0;
    return C;
  }

  // Writes A to os in text form, processing it in pieces of at most
  // 10000000 elements: each piece is converted with seqToString and written
  // without the final character of the converted string.
  template <class T>
  void writeSeqToStream(ofstream& os, parlay::sequence<T> const &A) {
    const size_t blockSize = 10000000;
    const size_t total = A.size();
    for (size_t first = 0; first < total; first += blockSize) {
      const size_t last = min(first + blockSize, total);
      charstring chunk = seqToString(A.cut(first, last));
      os.write(chunk.begin(), chunk.size()-1);
    }
  }

  // Writes `header` on its own line followed by the text form of A to the
  // file `fileName`. Returns 0 on success, or 1 (after printing a message)
  // if the file cannot be opened.
  // The former unused `auto a = A[0];` was removed: it read out of bounds
  // when A is empty.
  template <class T>
  int writeSeqToFile(string header,
		     parlay::sequence<T> const &A,
		     char const *fileName) {
    ofstream file (fileName, ios::out | ios::binary);
    if (!file.is_open()) {
      std::cout << "Unable to open file: " << fileName << std::endl;
      return 1;
    }
    file << header << endl;
    writeSeqToStream(file, A);
    file.close();
    return 0;
  }

  // Writes `header` on its own line followed by the text form of A and then
  // of B to the file `fileName`. Returns 0 on success, or 1 (after printing
  // a message) if the file cannot be opened.
  template <class T1, class T2>
  int write2SeqToFile(string header,
		      parlay::sequence<T1> const &A,
		      parlay::sequence<T2> const &B,
		      char const *fileName) {
    ofstream out (fileName, ios::out | ios::binary);
    if (out.is_open()) {
      out << header << endl;
      writeSeqToStream(out, A);
      writeSeqToStream(out, B);
      out.close();
      return 0;
    }
    std::cout << "Unable to open file: " << fileName << std::endl;
    return 1;
  }

  // Reads the whole file `fileName` into a character sequence and returns
  // it. Prints a message and aborts if the file cannot be opened.
  // Declared inline because this header defines the function: without it,
  // including the header from more than one translation unit produces
  // multiple-definition link errors.
  inline charstring readStringFromFile(char const *fileName) {
    // open positioned at the end so tellg() gives the file length
    ifstream file (fileName, ios::in | ios::binary | ios::ate);
    if (!file.is_open()) {
      std::cout << "Unable to open file: " << fileName << std::endl;
      abort();
    }
    long end = file.tellg();
    file.seekg (0, ios::beg);
    long n = end - file.tellg();
    charstring bytes(n, (char) 0);
    file.read (bytes.begin(), n);
    file.close();
    return bytes;
  }

  // Header token that identifies an integer-sequence file: written first by
  // writeIntSeqToFile and checked by readIntSeqFromFile.
  string intHeaderIO = "sequenceInt";

  // Writes A to `fileName` as an integer-sequence file, i.e. with the
  // intHeaderIO header line. Returns the result of writeSeqToFile
  // (0 on success, 1 if the file cannot be opened).
  template <class T>
  int writeIntSeqToFile(parlay::sequence<T> const &A, char const *fileName) {
    return writeSeqToFile(intHeaderIO, A, fileName);
  }

  // Maps the file `fileName` and returns its tokens, using
  // benchIO::is_space to recognise the separators.
  // Declared inline because this header defines the function: without it,
  // including the header from more than one translation unit produces
  // multiple-definition link errors.
  inline sequence<sequence<char>> get_tokens(char const *fileName) {
    // parlay::internal::timer t("get_tokens");
    // auto S = parlay::chars_from_file(fileName);
    auto S = parlay::file_map(fileName);
    // t.next("file map");
    auto r =  parlay::tokens(S, benchIO::is_space);
    // t.next("tokens");
    return r;
  }

  // Reads an integer-sequence file: the first token must be intHeaderIO and
  // every following token is converted with parlay::chars_to_long and stored
  // as a T. Prints "readIntSeqFromFile: bad input" and aborts if the header
  // token is missing or different.
  template <class T>
  parlay::sequence<T> readIntSeqFromFile(char const *fileName) {
    auto W = get_tokens(fileName);
    // A file without any token has no header either; check this first so
    // that W[0] is never accessed on an empty token list.
    if (W.size() == 0 || string(W[0].begin(),W[0].end()) != intHeaderIO) {
      cout << "readIntSeqFromFile: bad input" << endl;
      abort();
    }
    long n = W.size()-1;
    auto A = parlay::tabulate(n, [&] (long i) -> T {
	return parlay::chars_to_long(W[i+1]);});
    return A;
  }
};


================================================
FILE: rangeSearch/bench/MakeBench
================================================
# ********************
# GENERIC MAKEFILE FOR MOST BENCHMARKS THAT #include <name>.h
# USES FOLLOWING DEFINITIONS
#    BENCH : the name of the benchmark
#    REQUIRE : dependences
#    CC : the compiler
#    CFLAGS : compiler flags
#    LFLAGS : compiler link flags
# ********************

# Driver source compiled for the benchmark: ../bench/<BENCH>Time.C
TIME = ../bench/$(BENCH)Time.C
# Extra include flags; empty here.
INCLUDE = 

all : $(BENCH) 

# Builds the benchmark executable from the driver source. $(BENCH).h is
# force-included with -include, and STATS is defined for the compilation.
$(BENCH) : $(TIME) $(BENCH).h $(REQUIRE)
	$(CC) -DSTATS $(CFLAGS) $(INCLUDE) -include $(BENCH).h -o $(BENCH) $(TIME) $(LFLAGS)

# Removes the benchmark executable.
clean :
	rm -f $(BENCH)

# Runs clean, removes testInputs* files and runs "make clean" in ../bench.
cleanall : clean
	rm -f testInputs*; cd ../bench; make -s clean


================================================
FILE: rangeSearch/bench/Makefile
================================================
# Compiler and flag definitions (CC, CFLAGS, LFLAGS) come from parallelDefsANN.
include parallelDefsANN
BNCHMRK = range

# Object file of the checker, range Check.o (only used by the rule that is
# commented out below).
CHECKFILES = $(BNCHMRK)Check.o

# Extra prerequisites for every object file; empty here.
COMMON =

INCLUDE = -Icommon

# Compiles a .C source file into the matching .o object file.
%.o : %.C $(COMMON)
	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@

# Linking of the checker is currently disabled:
# $(BNCHMRK)Check : $(CHECKFILES)
# 	$(CC) $(LFLAGS) -o $@ $(CHECKFILES)

# Removes the checker binary, object files and compiled Python files.
clean :
	rm -f $(BNCHMRK)Check *.o *.pyc


================================================
FILE: rangeSearch/bench/get_time.h
================================================
#pragma once

#include <stdlib.h>
#include <sys/time.h>
#include <iomanip>
#include <iostream>
#include <string>

namespace cpam {

// Simple wall-clock stopwatch based on gettimeofday().
//
// While the timer is running (`on`), elapsed time is measured from
// `last_time`; stop() and get_next() add the elapsed interval to
// `total_time`. The report functions print to std::cout with four digits
// after the decimal point, prefixed with the timer's name.
struct timer {
  double total_time;   // seconds accumulated by stop() / get_next()
  double last_time;    // time stamp taken by the last start() / get_next()
  bool on;             // true while the timer is running
  std::string name;    // prefix printed by report()
  struct timezone tzp; // passed to gettimeofday()

  // Creates a timer called `name`; it starts running immediately unless
  // `_start` is false. last_time is initialised explicitly so that a timer
  // that was never started does not carry an indeterminate value.
  timer(std::string name = "PBBS time", bool _start = true)
  : total_time(0.0), last_time(0.0), on(false), name(name), tzp({0,0}) {
    if (_start) start();
  }

  // Current time of day in seconds (microsecond resolution).
  double get_time() {
    timeval now;
    gettimeofday(&now, &tzp);
    return ((double) now.tv_sec) + ((double) now.tv_usec)/1000000.;
  }

  // Starts (or restarts) measuring from now.
  void start () {
    on = true;
    last_time = get_time();
  }

  // Stops the timer, adds the time since last_time to the total and
  // returns that interval.
  double stop () {
    on = false;
    double d = (get_time()-last_time);
    total_time += d;
    return d;
  }

  // Clears the accumulated total and stops the timer.
  void reset() {
     total_time=0.0;
     on=false;
  }

  // Accumulated total, including the currently running interval if any.
  double get_total() {
    if (on) return total_time + get_time() - last_time;
    else return total_time;
  }

  // Returns the time since the previous start()/get_next(), adds it to the
  // total and begins a new interval. Returns 0 if the timer is not running.
  double get_next() {
    if (!on) return 0.0;
    double t = get_time();
    double td = t - last_time;
    total_time += td;
    last_time = t;
    return td;
  }

  // Prints "<name>: [<str>: ]<time>" in fixed notation with 4 decimals.
  // Both the format flags and the precision of std::cout are restored
  // afterwards; flags() alone does not cover the precision, which used to
  // stay at 4 after the first report.
  void report(double time, std::string str) {
    std::ios::fmtflags cout_settings = std::cout.flags();
    std::streamsize cout_precision = std::cout.precision(4);
    std::cout << std::fixed;
    std::cout << name << ": ";
    if (str.length() > 0)
      std::cout << str << ": ";
    std::cout << time << std::endl;
    std::cout.flags(cout_settings);
    std::cout.precision(cout_precision);
  }

  // Reports the accumulated total and clears it.
  void total() {
    report(get_total(),"total");
    total_time = 0.0;
  }

  // Reports the accumulated total under the label `str` without clearing it.
  void reportTotal(std::string str) {
    report(get_total(), str);
  }

  // If running, reports the time since the previous start()/get_next()
  // under the label `str` and begins a new interval.
  void next(std::string str) {
    if (on) report(get_next(), str);
  }
};

}  // namespace cpam


================================================
FILE: rangeSearch/bench/parallelDefsANN
================================================
# Link against jemalloc only when jemalloc-config is found on the PATH.
ifeq (, $(shell which jemalloc-config))
JEMALLOC =
else
JEMALLOCLD = $(shell jemalloc-config --libdir)
JEMALLOC = -L$(JEMALLOCLD) -ljemalloc 
endif

# Compile and link flags shared by all scheduler configurations.
CCFLAGS = -mcx16 -O3 -std=c++17 -march=native -DNDEBUG -I .
CLFLAGS = -ldl $(JEMALLOC) 

# Flags for the three supported parallel back ends.
OMPFLAGS = -DPARLAY_OPENMP -fopenmp
CILKFLAGS = -DPARLAY_CILK -fcilkplus
PBBFLAGS = -DHOMEGROWN -pthread

# Back-end selection: define OPENMP or CILK to pick that back end;
# otherwise the PBBFLAGS (-DHOMEGROWN -pthread) configuration is used.
ifdef OPENMP
CC = g++
CFLAGS = $(OMPFLAGS) $(CCFLAGS)
LFLAGS = $(OMPFLAGS) $(CLFLAGS)

else ifdef CILK
CC = g++
CFLAGS = $(CILKFLAGS) $(CCFLAGS)
LFLAGS = $(CILKFLAGS) $(CLFLAGS)

else
CC = g++
CFLAGS = $(PBBFLAGS) $(CCFLAGS)
LFLAGS = $(PBBFLAGS) $(CLFLAGS)
endif
endif


================================================
FILE: rangeSearch/bench/rangeTime.C
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <iostream>
#include <algorithm>
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "../../algorithms/bench/parse_command_line.h"
#include "../../algorithms/bench/time_loop.h"
#include "../utils/NSGDist.h"
#include "../utils/euclidian_point.h"
#include "../utils/point_range.h"
#include "../utils/mips_point.h"
#include "../utils/graph.h"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

using namespace parlayANN;

// *************************************************************
//  TIMING
// *************************************************************

using uint = unsigned int;

// Runs RNG on graph G with the given build parameters, query points, ground
// truth, result file and base points, then saves G to outFile when an output
// file was requested (outFile is not NULL). The parameter k is not used here.
template<typename Point, typename PointRange, typename indexType>
void timeRange(Graph<indexType> &G,
               PointRange &Query_Points, long k,
               BuildParams &BP, char* outFile,
               RangeGroundTruth<indexType> GT, char* res_file, bool graph_built,
               PointRange &Points)
{
  RNG<Point, PointRange, indexType>(G, BP, Query_Points, GT, res_file,
                                    graph_built, Points);

  // nothing to save unless an output file was given
  if (outFile == NULL) return;
  G.save(outFile);
}

// Command-line driver for the range-search benchmark.
//
// Parses the options, assembles the BuildParams, loads the range ground
// truth, and then, for the requested data type (float, uint8, int8) and
// distance function (Euclidian, mips), loads the base and query points,
// creates or loads the graph and calls timeRange. Float data can optionally
// be normalized (-normalize) and quantized to 8 or 16 bits (-quantize_bits).
//
// Fixes relative to the previous version:
//  * -dist_func, -data_type and -search_mode are converted to std::string
//    only when present; constructing a std::string from a null pointer is
//    undefined behaviour. Missing -dist_func / -data_type now reach the
//    existing error messages, and a missing -search_mode selects None like
//    any other unrecognised value.
//  * the -two_pass range check uses the logical || instead of bitwise |.
//  * the normalize loops use size_t counters instead of int.
//  * unused QT aliases in the mips branch were removed.
int main(int argc, char* argv[]) {
    commandLine P(argc,argv,
    "[-a <alpha>] [-d <delta>] [-R <deg>]"
        "[-L <bm>] [-k <k> ]  [-gt_path <g>] [-query_path <qF>]"
        "[-graph_path <gF>] [-graph_outfile <oF>] [-res_path <rF>]" "[-num_passes <np>]"
        "[-memory_flag <algoOpt>] [-mst_deg <q>] [-num_clusters <nc>] [-cluster_size <cs>]"
        "[-data_type <tp>] [-dist_func <df>] [-base_path <b>] <inFile>");

  char* iFile = P.getOptionValue("-base_path");
  char* oFile = P.getOptionValue("-graph_outfile");
  char* gFile = P.getOptionValue("-graph_path");
  char* qFile = P.getOptionValue("-query_path");
  char* cFile = P.getOptionValue("-gt_path");
  char* rFile = P.getOptionValue("-res_path");
  char* vectype = P.getOptionValue("-data_type");
  long Q = P.getOptionIntValue("-Q", 0);
  long R = P.getOptionIntValue("-R", 0);
  if(R<0) P.badArgument();
  long L = P.getOptionIntValue("-L", 0);
  if(L<0) P.badArgument();
  long MST_deg = P.getOptionIntValue("-mst_deg", 0);
  if(MST_deg < 0) P.badArgument();
  long num_clusters = P.getOptionIntValue("-num_clusters", 0);
  if(num_clusters<0) P.badArgument();
  long cluster_size = P.getOptionIntValue("-cluster_size", 0);
  if(cluster_size<0) P.badArgument();
  double radius  = P.getOptionDoubleValue("-radius", 0.0); // parsed but not used; see -r below
  long k = P.getOptionIntValue("-k", 0);
  if (k > 1000 || k < 0) P.badArgument();
  double alpha = P.getOptionDoubleValue("-alpha", 1.0);
  int num_passes = P.getOptionIntValue("-num_passes", 1);
  int two_pass = P.getOptionIntValue("-two_pass", 0);
  if(two_pass > 1 || two_pass < 0) P.badArgument();
  // -two_pass 1 overrides -num_passes
  if (two_pass == 1) num_passes = 2;
  double delta = P.getOptionDoubleValue("-delta", 0);
  if(delta<0) P.badArgument();
  char* dfc = P.getOptionValue("-dist_func");
  int quantize = P.getOptionIntValue("-quantize_bits", 0);
  int quantize_build = P.getOptionIntValue("-quantize_mode", 0);
  bool verbose = P.getOption("-verbose");
  bool normalize = P.getOption("-normalize");
  double trim = P.getOptionDoubleValue("-trim", 0.0); // not used
  bool self = P.getOption("-self");
  int rerank_factor = P.getOptionIntValue("-rerank_factor", 100);
  bool range = P.getOption("-range"); // parsed but not used
  bool is_early_stop = P.getOption("-early_stop");
  char* sm = P.getOptionValue("-search_mode");
  double esr = P.getOptionDoubleValue("-early_stopping_radius", 0);
  double rad  = P.getOptionDoubleValue("-r", 0.0);
  double batch_factor = P.getOptionDoubleValue("-batch_factor", .125);
    
  // this integer represents the number of random edges to start with for
  // inserting in a single batch per round
  int single_batch = P.getOptionIntValue("-single_batch", 0);
    
  // An option that was not given is treated as an empty string.
  std::string df = (dfc != NULL) ? std::string(dfc) : std::string();
  std::string tp = (vectype != NULL) ? std::string(vectype) : std::string();

  std::string searchType = (sm != NULL) ? std::string(sm) : std::string();
  rangeQueryType rtype = Beam;

  if (searchType == "doubling") {
    rtype = Doubling;
    std::cout << "Using doubling range search" << std::endl;
  } else if (searchType == "greedy") {
    rtype = Greedy;
    std::cout << "Using greedy range search" << std::endl;
  }
  else if (searchType == "beam") {
    rtype = Beam;
    std::cout << "Using beam range search" << std::endl;
  }
  else rtype = None;

  BuildParams BP = BuildParams(R, L, alpha, num_passes, num_clusters, cluster_size, MST_deg, delta,
                               verbose, quantize_build,
                               self, single_batch,
                               Q, trim,
                               rerank_factor, batch_factor,
                               is_early_stop, esr,
                               rtype, rad);
  long maxDeg = BP.max_degree();

  if((tp != "uint8") && (tp != "int8") && (tp != "float")){
    std::cout << "Error: vector type not specified correctly, specify int8, uint8, or float" << std::endl;
    abort();
  }

  if(df != "Euclidian" && df != "mips"){
    std::cout << "Error: specify distance type Euclidian or mips" << std::endl;
    abort();
  }

  // a graph file on the command line means the graph is loaded, not built
  bool graph_built = (gFile != NULL);

  RangeGroundTruth<uint> GT = RangeGroundTruth<uint>(cFile);
  
  if(tp == "float"){
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<float>> Points(iFile);
      PointRange<Euclidian_Point<float>> Query_Points(qFile);
      if (normalize) {
        std::cout << "normalizing data" << std::endl;
        for (size_t i=0; i < Points.size(); i++) 
          Points[i].normalize();
        for (size_t i=0; i < Query_Points.size(); i++) 
          Query_Points[i].normalize();
      }
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      if (quantize == 8) {
        std::cout << "quantizing data to 1 byte" << std::endl;
        using QT = uint8_t;
        using QPoint = Euclidian_Point<QT>;
        using PR = PointRange<QPoint>;
        PR Points_(Points);
        // queries are quantized with the parameters of the base points
        PR Query_Points_(Query_Points, Points_.params);
        timeRange<QPoint, PR, uint>(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_);
      } else if (quantize == 16) {
        std::cout << "quantizing data to 2 bytes" << std::endl;
        using Point = Euclidian_Point<uint16_t>;
        using PR = PointRange<Point>;
        PR Points_(Points);
        PR Query_Points_(Query_Points, Points_.params);
        timeRange<Point, PR, uint>(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_);
      } else {
        using Point = Euclidian_Point<float>;
        using PR = PointRange<Point>;
        timeRange<Point, PR, uint>(G, Query_Points, k, BP, oFile, GT, rFile, graph_built, Points);
      }
    } else if(df == "mips"){
      PointRange<Mips_Point<float>> Points(iFile);
      PointRange<Mips_Point<float>> Query_Points(qFile);
      if (normalize) {
        std::cout << "normalizing data" << std::endl;
        for (size_t i=0; i < Points.size(); i++) 
          Points[i].normalize();
        for (size_t i=0; i < Query_Points.size(); i++) 
          Query_Points[i].normalize();
      }
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      if (quantize == 8) {
        std::cout << "quantizing data to 1 byte" << std::endl;
        using Point = Quantized_Mips_Point<8>;
        using PR = PointRange<Point>;
        PR Points_(Points);
        PR Query_Points_(Query_Points, Points_.params);
        timeRange<Point, PR, uint>(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_);
      } else if (quantize == 16) {
        std::cout << "quantizing data to 2 bytes" << std::endl;
        using Point = Quantized_Mips_Point<16>;
        using PR = PointRange<Point>;
        PR Points_(Points);
        PR Query_Points_(Query_Points, Points_.params);
        timeRange<Point, PR, uint>(G, Query_Points_, k, BP, oFile, GT, rFile, graph_built, Points_);
      } else {
        using Point = Mips_Point<float>;
        using PR = PointRange<Point>;
        timeRange<Point, PR, uint>(G, Query_Points, k, BP, oFile, GT, rFile, graph_built, Points);
      }
    }
  } else if(tp == "uint8"){
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<uint8_t>> Points(iFile);
      PointRange<Euclidian_Point<uint8_t>> Query_Points(qFile);
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      timeRange<Euclidian_Point<uint8_t>, PointRange<Euclidian_Point<uint8_t>>, uint>(G, Query_Points, k, BP, 
        oFile, GT, rFile, graph_built, Points);
    } else if(df == "mips"){
      PointRange<Mips_Point<uint8_t>> Points(iFile);
      PointRange<Mips_Point<uint8_t>> Query_Points(qFile);
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      timeRange<Mips_Point<uint8_t>, PointRange<Mips_Point<uint8_t>>, uint>(G, Query_Points, k, BP, 
        oFile, GT, rFile, graph_built, Points);
    }
  } else if(tp == "int8"){
    if(df == "Euclidian"){
      PointRange<Euclidian_Point<int8_t>> Points(iFile);
      PointRange<Euclidian_Point<int8_t>> Query_Points(qFile);
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      timeRange<Euclidian_Point<int8_t>, PointRange<Euclidian_Point<int8_t>>, uint>(G, Query_Points, k, BP,
        oFile, GT, rFile, graph_built, Points);
    } else if(df == "mips"){
      PointRange<Mips_Point<int8_t>> Points(iFile);
      PointRange<Mips_Point<int8_t>> Query_Points(qFile);
      Graph<unsigned int> G; 
      if(gFile == NULL) G = Graph<unsigned int>(maxDeg, Points.size());
      else G = Graph<unsigned int>(gFile);
      timeRange<Mips_Point<int8_t>, PointRange<Mips_Point<int8_t>>, uint>(G, Query_Points, k, BP,
        oFile, GT, rFile, graph_built, Points);
    }
  }
  
  return 0;
}


================================================
FILE: rangeSearch/vamanaRange/Makefile
================================================
# Compiler and flag definitions (CC, CFLAGS, LFLAGS).
include ../bench/parallelDefsANN

# Headers listed as prerequisites of the benchmark executable, so that a
# change to any of them triggers a rebuild.
REQUIRE = ../utils/beamSearch.h ../utils/doublingSearch.h ../../algorithms/vamana/index.h  ../utils/check_nn_recall.h ../utils/NSGDist.h ../utils/parse_results.h ../utils/graph.h ../utils/point_range.h ../utils/check_range_recall.h ../utils/earlyStopping.h ../utils/rangeSearch.h ../utils/types.h ../utils/stats.h
# Benchmark name: MakeBench builds the executable "range" from range.h and
# ../bench/rangeTime.C.
BENCH = range

# Generic build rules (all, clean, cleanall).
include ../bench/MakeBench


================================================
FILE: rangeSearch/vamanaRange/range.h
================================================
// This code is part of the Problem Based Benchmark Suite (PBBS)
// Copyright (c) 2011 Guy Blelloch and the PBBS team
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights (to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <algorithm>

#include "../utils/NSGDist.h"
#include "../utils/check_range_recall.h"
#include "../utils/beamSearch.h"
#include "../utils/check_nn_recall.h"
#include "../utils/parse_results.h"
#include "../utils/stats.h"
#include "../utils/types.h"
#include "../utils/graph.h"
#include "../utils/mips_point.h"
#include "../utils/euclidian_point.h"
#include "../../algorithms/vamana/index.h"
#include "parlay/parallel.h"
#include "parlay/primitives.h"
#include "parlay/random.h"

namespace parlayANN{

// Driver for the Vamana range-search benchmark.
//
// Builds a Vamana graph over `Points` (unless `graph_built` says the graph in
// `G` is already complete), prints a summary of the graph and of the build
// statistics, and then evaluates range queries through range_search_wrapper.
//
//   G             graph to build into, or the pre-built graph to search
//   BP            build/search parameters (R, L, quantize, radius,
//                 early-stopping settings, range query type)
//   Query_Points  query set; when it is empty no search is performed
//   GT            range ground truth forwarded to range_search_wrapper
//   res_file      not referenced by this function
//   graph_built   true => skip the build and report an index time of 0
//   Points        base points the graph is built over and searched in
template<typename Point, typename PointRange_,  typename indexType>
void RNG(Graph<indexType> &G, BuildParams &BP,
         PointRange_ &Query_Points,
         RangeGroundTruth<indexType> GT,
         char* res_file, bool graph_built, PointRange_ &Points) {
  parlay::internal::timer t("ANN");
  using index_builder = knn_index<PointRange_, PointRange_, indexType>;
  index_builder builder(BP);
  stats<unsigned int> build_stats(G.size());

  // Searches always start from vertex 1; the builder's own start point
  // (I.get_start()) is intentionally not consulted here.
  indexType start_point = 1;
  // Stays 0 when a pre-built graph was supplied.
  double idx_time = 0;
  if (!graph_built) {
    builder.build_index(G, Points, Points, build_stats);
    idx_time = t.next_time();
  }

  // Report build statistics and a one-line description of the graph.
  std::string name = "Vamana";
  std::string params =
      "R = " + std::to_string(BP.R) + ", L = " + std::to_string(BP.L);
  auto [avg_deg, max_deg] = graph_stats_(G);
  auto visited = build_stats.visited_stats();
  std::cout << "Average visited: " << visited[0]
            << ", Tail visited: " << visited[1] << std::endl;
  Graph_ graph_summary(name, params, G.size(), avg_deg, max_deg, idx_time);
  graph_summary.print();

  double esr = BP.early_stopping_radius;
  double rad = BP.radius;

  // Nothing to evaluate without queries.
  if (Query_Points.size() == 0) return;

  // Every search variant differs only in the point ranges used for the
  // first pass, so route all of them through one call site.
  auto run_search = [&](auto &first_pass_points, auto &first_pass_queries) {
    range_search_wrapper<Point>(G,
                                Points, Query_Points,
                                first_pass_points, first_pass_queries,
                                GT, start_point,
                                BP.is_early_stop, esr, BP.range_query_type, rad);
  };

  // No quantization requested: the first pass uses the original points.
  if (BP.quantize == 0) {
    run_search(Points, Query_Points);
    return;
  }

  std::cout << "quantizing build and first pass of search to 1 byte" << std::endl;
  if (Point::is_metric()) {
    // Metric point types are quantized to one-byte Euclidean points.
    using QPR = PointRange<Euclidian_Point<uint8_t>>;
    QPR Q_Points(Points);
    // Queries are quantized with the parameters derived from the base points.
    QPR Q_Query_Points(Query_Points, Q_Points.params);
    run_search(Q_Points, Q_Query_Points);
  } else {
    // Non-metric point types use the 8-bit quantized MIPS representation.
    using QPR = PointRange<Quantized_Mips_Point<8,true,255>>;
    QPR Q_Points(Points);
    QPR Q_Query_Points(Query_Points, Q_Points.params);
    run_search(Q_Points, Q_Query_Points);
  }
}
}