[
  {
    "path": ".github/FUNDING.yml",
    "content": "# These are supported funding model platforms\n\ngithub: spacejam # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]\npatreon: # Replace with a single Patreon username\nopen_collective: # Replace with a single Open Collective username\nko_fi: # Replace with a single Ko-fi username\ntidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel\ncommunity_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry\nliberapay: # Replace with a single Liberapay username\nissuehunt: # Replace with a single IssueHunt username\notechie: # Replace with a single Otechie username\ncustom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/blank_issue.md",
    "content": "---\nname: Blank Issue (do not use this for bug reports or feature requests)\nabout: Create an issue with a blank template.\n---\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bugs.md",
    "content": "---\nname: Bug Report\nabout: Report a correctness issue or violated expectation\nlabels: bug\n---\n\nBug reports must include all following items:\n\n1. expected result\n1. actual result\n1. sled version\n1. rustc version\n1. operating system\n1. minimal code sample that helps to reproduce the issue\n1. logs, panic messages, stack traces\n\nIncomplete bug reports will be closed.\n\nDo not open bug reports for documentation issues. Please just open a PR with the proposed documentation change.\n\nThank you for understanding :)\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "blank_issues_enabled: true\ncontact_links:\n  - name: sled discord\n    url: https://discord.gg/Z6VsXds\n    about: Please ask questions in the discord server here.\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.md",
    "content": "---\nname: Feature Request\nabout: Request a feature for sled\nlabels: feature\n---\n\n#### Use Case:\n\n#### Proposed Change:\n\n#### Who Benefits From The Change(s)?\n\n#### Alternative Approaches\n"
  },
  {
    "path": ".github/dependabot.yml",
    "content": "version: 2\nupdates:\n- package-ecosystem: cargo\n  directory: \"/\"\n  schedule:\n    interval: daily\n    time: \"10:00\"\n  open-pull-requests-limit: 10\n  ignore:\n  - dependency-name: crdts\n    versions:\n    - \">= 2.a, < 3\"\n  - dependency-name: zerocopy\n    versions:\n    - 0.4.0\n"
  },
  {
    "path": ".github/workflows/test.yml",
    "content": "name: Rust\n\non:\n  pull_request:\n    branches:\n    - main\n\njobs:\n  clippy_check:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v1\n      - uses: actions-rs/toolchain@v1\n        with:\n            toolchain: nightly\n            components: clippy\n            override: true\n      - run: rustup component add clippy\n      - uses: actions-rs/clippy-check@v1\n        with:\n          token: ${{ secrets.GITHUB_TOKEN }}\n          args: --all-features\n  default:\n    name: Cargo Test on ${{ matrix.os }}\n    env:\n      RUST_BACKTRACE: 1\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        os: [ubuntu-latest, macos-latest, windows-latest]\n    steps:\n    - uses: actions/checkout@v1\n    - name: Cache target\n      uses: actions/cache@v2\n      env:\n        cache-name: cache-default-target-and-lockfile\n      with:\n        path: |\n          target\n          Cargo.lock\n          ~/.rustup\n        key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/Cargo.toml') }}\n    - name: linux coredump setup\n      if: ${{ runner.os == 'linux' }}\n      run: |\n        ulimit -c unlimited\n        echo \"$PWD/core-dumps/corefile-%e-%p-%t\" | sudo tee /proc/sys/kernel/core_pattern\n        mkdir core-dumps\n    - name: cargo test\n      run: |\n        rustup update --no-self-update\n        cargo test --release --no-default-features --features=for-internal-testing-only -- --nocapture\n    - uses: actions/upload-artifact@v4\n      if: ${{ failure() && runner.os == 'linux' }}\n      with:\n        name: linux-core-dumps\n        path: |\n          ./core-dumps/*\n          ./target/release/deps/test_*\n  examples:\n    name: Example Tests\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actions/checkout@v1\n    - name: Cache target\n      uses: actions/cache@v2\n      env:\n        cache-name: cache-examples-target-and-lockfile\n      with:\n        path: |\n          
target\n          Cargo.lock\n          ~/.rustup\n        key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/Cargo.toml') }}\n    - name: example tests\n      run: |\n        rustup update --no-self-update\n        cargo run --example playground\n        cargo run --example structured\n  cross-compile:\n    name: Cross Compile\n    runs-on: macos-latest\n    steps:\n    - uses: actions/checkout@v1\n    - name: cross compile\n      run: |\n        set -eo pipefail\n        echo \"cross build\"\n        scripts/cross_compile.sh\n  burn-in:\n    name: Burn In\n    env:\n      RUST_BACKTRACE: 1\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actions/checkout@v1\n    - name: Cache target\n      uses: actions/cache@v2\n      env:\n        cache-name: cache-stress2-asan-target-and-lockfile\n      with:\n        path: |\n          benchmarks/stress2/target\n          benchmarks/stress2/Cargo.lock\n          ~/.rustup\n        key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/Cargo.toml') }}\n    - name: burn in\n      run: |\n        set -eo pipefail\n        pushd benchmarks/stress2\n        ulimit -c unlimited\n        echo \"$PWD/core-dumps/corefile-%e-%p-%t\" | sudo tee /proc/sys/kernel/core_pattern\n        mkdir core-dumps\n        rustup toolchain install nightly\n        rustup toolchain install nightly --component rust-src\n        rustup update\n        rm -rf default.sled || true\n        export RUSTFLAGS=\"-Z sanitizer=address\"\n        export ASAN_OPTIONS=\"detect_odr_violation=0\"\n        cargo +nightly build --release --target x86_64-unknown-linux-gnu\n        target/x86_64-unknown-linux-gnu/release/stress2 --duration=240\n        rm -rf default.sled\n    - name: print backtraces with gdb\n      if: ${{ failure() }}\n      run: |\n        sudo apt-get update\n        sudo apt-get install gdb\n        pushd benchmarks/stress2\n        echo \"first backtrace:\"\n        gdb target/release/stress2 core-dumps/* -batch -ex 'bt 
-frame-info source-and-location'\n        echo \"\"\n        echo \"\"\n        echo \"\"\n        echo \"all backtraces:\"\n        gdb target/release/stress2 core-dumps/* -batch -ex 't a a bt -frame-info source-and-location'\n    - uses: actions/upload-artifact@v4\n      if: ${{ failure() }}\n      with:\n        name: linux-core-dumps\n        path: |\n          ./benchmarks/stress2/core-dumps/*\n          ./benchmarks/stress2/target/release/stress2\n  sanitizers:\n    name: Sanitizers\n    env:\n      RUST_BACKTRACE: 1\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actions/checkout@v1\n    - name: Cache rustup\n      uses: actions/cache@v2\n      env:\n        cache-name: cache-sanitizers-target-and-lockfile\n      with:\n        path: |\n          ~/.rustup\n          benchmarks/stress2/target\n          benchmarks/stress2/Cargo.lock\n        key: ${{ runner.os }}-${{ env.cache-name }}-${{ hashFiles('**/Cargo.toml') }}\n    - name: sanitizers\n      run: |\n        set -eo pipefail\n        scripts/sanitizers.sh\n"
  },
  {
    "path": ".gitignore",
    "content": "CLAUDE.md\nfuzz-*.log\ndefault.sled\ntiming_test*\n*db\ncrash_test_files\n*conf\n*snap.*\n*grind.out*\nvgcore*\n*.bk\n*orig\ntags\nperf*\n*folded\n*out\n*perf\n*svg\n*txt\nexperiments\ntarget\nCargo.lock\n*swp\n*swo\n*.proptest-regressions\ncorpus\nartifacts\n.idea\ncargo-timing*\n"
  },
  {
    "path": ".rustfmt.toml",
    "content": "version = \"Two\"\nuse_small_heuristics = \"Max\"\nreorder_imports = true\nmax_width = 80\nwrap_comments = true\ncombine_control_expr = true\nreport_todo = \"Always\"\n"
  },
  {
    "path": "ARCHITECTURE.md",
    "content": "<table style=\"width:100%\">\n<tr>\n  <td>\n    <table style=\"width:100%\">\n      <tr>\n        <td> key </td>\n        <td> value </td>\n      </tr>\n      <tr>\n        <td><a href=\"https://github.com/sponsors/spacejam\">buy a coffee for us to convert into databases</a></td>\n        <td><a href=\"https://github.com/sponsors/spacejam\"><img src=\"https://img.shields.io/github/sponsors/spacejam\"></a></td>\n      </tr>\n      <tr>\n        <td><a href=\"https://docs.rs/sled\">documentation</a></td>\n        <td><a href=\"https://docs.rs/sled\"><img src=\"https://docs.rs/sled/badge.svg\"></a></td>\n      </tr>\n      <tr>\n        <td><a href=\"https://discord.gg/Z6VsXds\">chat about databases with us</a></td>\n        <td><a href=\"https://discord.gg/Z6VsXds\"><img src=\"https://img.shields.io/discord/509773073294295082.svg?logo=discord\"></a></td>\n      </tr>\n     </table>\n  </td>\n  <td>\n<p align=\"center\">\n  <img src=\"https://raw.githubusercontent.com/spacejam/sled/main/art/tree_face_anti-transphobia.png\" width=\"40%\" height=\"auto\" />\n  </p>\n  </td>\n </tr>\n</table>\n\n# sled 1.0 architecture\n\n## in-memory\n\n* Lock-free B+ tree index, extracted into the [`concurrent-map`](https://github.com/komora-io/concurrent-map) crate.\n* The lowest key from each leaf is stored in this in-memory index.\n* To read any leaf that is not already cached in memory, at most one disk read will be required.\n* RwLock-backed leaves, using the ArcRwLock from the [`parking_lot`](https://github.com/Amanieu/parking_lot) crate. As a `Db` grows, leaf contention tends to go down in most use cases. But this may be revisited over time if many users have issues with RwLock-related contention. 
Avoiding full RCU for updates on the leaves results in many of the performance benefits over sled 0.34, with significantly lower memory pressure.\n* A simple but very high performance epoch-based reclamation technique is used for safely deferring frees of in-memory index data and reuse of on-disk heap slots, extracted into the [`ebr`](https://github.com/komora-io/ebr) crate.\n* A scan-resistant LRU is used for handling eviction. By default, 20% of the cache is reserved for leaves that are accessed at most once. This is configurable via `Config.entry_cache_percent`. This is handled by the extracted [`cache-advisor`](https://github.com/komora-io/cache-advisor) crate. The overall cache size is set by the `Config.cache_size` configurable.\n\n## write path\n\n* This is where things get interesting. There is no traditional WAL. There is no LSM. Only metadata is logged atomically after objects are written in parallel.\n* The important guarantees are:\n  * all previous writes are durable after a call to `Db::flush` (This is also called periodically in the background by a flusher thread)\n  * all write batches written using `Db::apply_batch` are either 100% visible or 0% visible after crash recovery. If it was followed by a flush that returned `Ok(())` it is guaranteed to be present.\n* Atomic ([linearizable](https://jepsen.io/consistency/models/linearizable)) durability is provided by marking dirty leaves as participants in \"flush epochs\" and performing atomic batch writes of the full epoch at a time, in order. Each call to `Db::flush` advances the current flush epoch by 1.\n* The atomic write consists in the following steps:\n  1. User code or the background flusher thread calls `Db::flush`.\n  1. In parallel (via [rayon](https://docs.rs/rayon)) serialize and compress each dirty leaf with zstd (configurable via `Config.zstd_compression_level`).\n  1. Based on the size of the bytes for each object, choose the smallest heap file slot that can hold the full set of bytes. 
This is an on-disk slab allocator.\n  1. Slab slots are not power-of-two sized, but tend to increase in size by around 20% from one to the next, resulting in far lower fragmentation than typical page-oriented heaps with either constant-size or power-of-two sized leaves.\n  1. Write the object to the allocated slot from the rayon threadpool.\n  1. After all writes, fsync the heap files that were written to.\n  1. If any writes were written to the end of the heap file, causing it to grow, fsync the directory that stores all heap files.\n  1. After the writes are stable, it is now safe to write an atomic metadata batch that records the location of each written leaf in the heap. This is a simple framed batch of `(low_key, slab_slot)` tuples that are initially written to a log, but eventually merged into a simple snapshot file for the metadata store once the log becomes larger than the snapshot file.\n  1. Fsync of the metadata log file.\n  1. Fsync of the metadata log directory.\n  1. After the atomic metadata batch write, the previously occupied slab slots are marked for future reuse with the epoch-based reclamation system. After all threads that may have witnessed the previous location have finished their work, the slab slot is added to the free `BinaryHeap` of the slot that it belongs to so that it may be reused in future atomic write batches.\n  1. Return `Ok(())` to the caller of `Db::flush`.\n* Writing objects before the metadata write is random, but modern SSDs handle this well. Even though the SSD's FTL will be working harder to defragment things periodically than if we wrote a few megabytes sequentially with each write, the data that the FTL will be copying will be mostly live due to the eager leaf write-backs.\n\n## recovery\n\n* Recovery involves simply reading the atomic metadata store that records the low key for each written leaf as well as its location and mapping it into the in-memory index. 
Any gaps in the slabs are then used as free slots.\n* Any write that failed to complete its entire atomic write batch is treated as if it never happened, because no user-visible flush ever returned successfully.\n* Rayon is also used here for parallelizing reads of this metadata. In general, this is extremely fast compared to the previous sled recovery process.\n\n## tuning\n\n* The larger the `LEAF_FANOUT` const generic on the high-level `Db` struct (default `1024`), the smaller the in-memory leaf index and the better the compression ratio of the on-disk file, but the more expensive it will be to read the entire leaf off of disk and decompress it.\n* You can choose to set `LEAF_FANOUT` relatively low to make the system behave more like an Index+Log architecture, but overall disk size will grow and write performance will decrease.\n* NB: changing `LEAF_FANOUT` after writing data is not supported.\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "# Unreleased\n\n## New Features\n\n* #1178 batches and transactions are now unified for subscribers.\n* #1231 `Tree::get_zero_copy` allows for reading a value directly\n  in-place without making an `IVec` first.\n* #1250 the global `print_profile` function has been added\n  which is enabled when compiling with the `metrics` feature.\n* #1254 `IVec` data will now always have an alignment of 8,\n  which may enable interesting architecture-specific use cases.\n* #1307 & #1315 `Db::contains_tree` can be used to see if a\n  `Tree` with a given name already exists.\n\n## Improvements\n\n* #1214 a new slab-style storage engine has been added which\n  replaces the previous file-per-blob technique for storing\n  large pages.\n* #1231 tree nodes now get merged into a single-allocation\n  representation that is able to dynamically avoid various\n  overheads, resulting in significant efficiency improvements.\n\n## Breaking Changes\n\n* #1400 Bump MSRV to 1.57.\n* #1399 Thread support is now required on all platforms.\n* #1135 The \"no_metrics\" anti-feature has been replaced with\n  the \"metrics\" positive feature.\n* #1178 the `Event` enum has become a unified struct that allows\n  subscribers to iterate over each (Tree, key, optional value)\n  involved in single key operations, batches, or transactions in\n  a unified way.\n* #1178 the `Event::key` method has been removed in favor of the\n  new more comprehensive `iter` method.\n* #1214 The deprecated `Config::build` method has been removed.\n* #1248 The deprecated `Tree::set` method has been removed.\n* #1248 The deprecated `Tree::del` method has been removed.\n* #1250 The `Config::print_profile_on_drop` method has been\n  removed in favor of the global `print_profile` function.\n* #1252 The deprecated `Db::open` method has been removed.\n* #1252 The deprecated `Config::segment_cleanup_skew` method\n  has been removed.\n* #1252 The deprecated `Config::segment_cleanup_threshold`\n  method has been 
removed.\n* #1252 The deprecated `Config::snapshot_path` method has\n  been removed.\n* #1253 The `IVec::subslice` method has been removed.\n* #1275 Keys and values are now limited to 128gb on 64-bit\n  platforms and 512mb on 32-bit platforms.\n* #1281 `Config`'s `cache_capacity` is now a usize, as u64\n  doesn't make sense for things that must fit in memory anyway.\n* #1314 `Subscriber::next_timeout` now requires a mutable self\n  reference.\n* #1349 The \"measure_allocs\" feature has been removed.\n* #1354 `Error` has been modified to be Copy, removing all\n  heap-allocated variants.\n\n## Bug Fixes\n\n* #1202 Fix a space leak where blobs were not\n  removed when replaced by another blob.\n* #1229 the powerful ALICE crash consistency tool has been\n  used to discover several crash vulnerabilities, now fixed.\n\n# 0.34.7\n\n## Bug Fixes\n\n* #1314 Fix a bug in Subscriber's Future impl.\n\n# 0.34.6\n\n## Improvements\n\n* documentation improved\n\n# 0.34.5\n\n## Improvements\n\n* #1164 widen some trait bounds on trees and batches\n\n# 0.34.4\n\n## New Features\n\n* #1151 `Send` is implemented for `Iter`\n* #1167 added `Tree::first` and `Tree::last` functions\n  to retrieve the first or last items in a `Tree`, unless\n  the `Tree` is empty.\n\n## Bug Fixes\n\n* #1159 dropping a `Db` instance will no-longer\n  prematurely shut-down the background flusher\n  thread.\n* #1168 fixed an issue that was causing panics during\n  recovery in 32-bit code.\n* #1170 when encountering corrupted storage data,\n  the recovery process will panic less often.\n\n# 0.34.3\n\n## New Features\n\n* #1146 added `TransactionalTree::generate_id`\n\n# 0.34.2\n\n## Improvements\n\n* #1133 transactions and writebatch performance has been\n  significantly improved by removing a bottleneck in\n  the atomic batch stability tracking code.\n\n# 0.34.1\n\n## New Features\n\n* #1136 Added the `TransactionalTree::flush` method to\n  flush the underlying database after the transaction\n  commits and 
before the transaction returns.\n\n# 0.34\n\n## Improvements\n\n* #1132 implemented From<sled::Error> for io::Error to\n  reduce friction in some situations.\n\n## Breaking Changes\n\n* #1131 transactions performed on `Tree`s from different\n  `Db`s will now safely fail.\n* #1131 transactions may now only be performed on tuples\n  of up to 14 elements. For higher numbers, please use\n  slices.\n\n# 0.33\n\n## Breaking Changes\n\n* #1125 the backtrace crate has been made optional, which\n  cuts several seconds off compilation time, but may cause\n  breakage if you interacted with the backtrace field\n  of corruption-related errors.\n\n## Bug Fixes\n\n* #1128 `Tree::pop_min` and `Tree::pop_max` had a bug where\n  they were not atomic.\n\n# 0.32.1\n\n## New Features\n\n* #1116 `IVec::subslice` has been added to facilitate\n  creating zero-copy subsliced `IVec`s that are backed\n  by the same data.\n\n## Bug Fixes\n\n* #1120 Fixed a use-after-free caused by missing `ref` keyword\n  on a `Copy` type in a pattern match in `IVec::as_mut`.\n* #1108 conversions from `Box<[u8]>` to `IVec` are fixed.\n\n# 0.32\n\n## New Features\n\n* #1079 `Transactional` is now implemented for\n  `[&Tree]` and `[Tree]` so you can avoid the\n  previous friction of using tuples, as was\n  necessary previously.\n* #1058 The minimum supported Rust version (MSRV)\n  is now 1.39.0.\n* #1037 `Subscriber` now implements `Future` (non-fused)\n  so prefix watching may now be iterated over via\n  `while let Some(event) = (&mut subscriber).await {}`\n\n## Improvements\n\n* #965 concurrency control is now dynamically enabled\n  for atomic point operations, so that it may be\n  avoided unless transactional functionality is\n  being used in the system. 
This significantly\n  increases performance for workloads that do not\n  use transactions.\n* A number of memory optimizations have been implemented.\n* Disk usage has been significantly reduced for many\n  workloads.\n* #1016 On 64-bit systems, we can now store 1-2 trillion items.\n* #993 Added DerefMut and AsMut<[u8]> for `IVec` where it\n  works similarly to a `Cow`, making a private copy\n  if the backing `Arc`'s strong count is not 1.\n* #1020 The sled wiki has been moved into the documentation\n  itself, and is accessible through the `doc` module\n  exported in lib.\n\n## Breaking Changes\n\n* #975 Changed the default `segment_size` from 8m to 512k.\n  This will result in far smaller database files due\n  to better file garbage collection granularity.\n* #975 deprecated several `Config` options that will be\n  removed over time.\n* #1000 rearranged some transaction-related imports, and\n  moved them to the `transaction` module away from\n  the library root to keep the top level docs clean.\n* #1015 `TransactionalTree::apply_batch` now accepts\n  its argument by reference instead of by value.\n* `Event` has been changed to make the inner fields\n  named instead of anonymous.\n* #1057 read-only mode has been removed due to not having\n  the resources to properly keep it tested while\n  making progress on high priority issues. This may\n  be correctly implemented in the future if resources\n  permit.\n* The conversion between `Box<[u8]>` and `IVec` has\n  been temporarily removed. This is re-added in 0.32.1.\n\n# 0.31\n\n## Improvements\n\n* #947 dramatic read and recovery optimizations\n* #921 reduced the reliance on locks while\n  performing multithreaded IO on windows.\n* #928 use `sync_file_range` on linux instead\n  of a full fsync for most writes.\n* #946 io_uring support changed to the `rio` crate\n* #939 reduced memory consumption during\n  zstd decompression\n\n## Breaking Changes\n\n* #927 use SQLite-style varints for serializing\n  `u64`. 
This dramatically reduces the written\n  bytes for databases that store small keys and\n  values.\n* #943 use varints for most of the fields in\n  message headers, causing an additional large\n  space reduction. combined with #927, these\n  changes reduce bytes written by 68% for workloads\n  writing small items.\n\n# 0.30.3\n\n* Documentation-only release\n\n# 0.30.2\n\n## New Features\n\n* Added the `open` function for quickly\n  opening a database at a path with default\n  configuration.\n\n# 0.30.1\n\n## Bugfixes\n\n* Fixed an issue where an idle threadpool worker\n  would spin in a hot loop until work arrived\n\n# 0.30\n\n## Breaking Changes\n\n* Migrated to a new storage format\n\n## Bugfixes\n\n* Fixed a bug where cache was not being evicted.\n* Fixed a bug with using transactions with\n  compression.\n\n# 0.29.2\n\n## New Features\n\n* The `create_new` option has been added\n  to `Config`, allowing the user to specify\n  that a database should only be freshly\n  created, rather than re-opened.\n\n# 0.29.1\n\n## Bugfixes\n\n* Fixed a bug where prefix encoding could be\n  incorrectly handled when merging nodes together.\n\n# 0.29\n\n## New Features\n\n* The `Config::open` method has been added to give\n  `Config` a similar feel to std's `fs::OpenOptions`.\n  The `Config::build` and `Db::start` methods are\n  now deprecated in favor of calling `Config::open`\n  directly.\n* A `checksum` method has been added to Tree and Db\n  for use in verifying backups and migrations.\n* Transactions may now involve up to 69 different\n  tables. Nice.\n* The `TransactionError::Abort` variant has had\n  a generic member added that can be returned\n  as a way to return information from a\n  manually-aborted transaction. 
An `abort` helper\n  function has been added to reduce the boiler-\n  plate required to return aborted results.\n\n## Breaking Changes\n\n* The `ConfigBuilder` structure has been removed\n  in favor of a simplified `Config` structure\n  with the same functionality.\n* The way that sled versions are detected at\n  initialization time is now independent of serde.\n* The `cas` method is deprecated in favor of the new\n  `compare_and_swap` method which now returns the\n  proposed value that failed to be applied.\n* Tree nodes now have constant prefix encoding\n  lengths.\n* The `io_buf_size` configurable renamed to\n  `segment_size`.\n* The `io_buf_size` configurable method has been\n  removed from ConfigBuilder. This can be manually\n  set by setting the attribute directly on the\n  ConfigBuilder, but this is discouraged.\n  Additionally, this must now be a power of 2.\n* The `page_consolidation_threshold` method has been\n  removed from ConfigBuilder, and this is now\n  a constant of 10.\n\n# 0.28\n\n## Breaking Changes\n\n* `Iter` no longer has a lifetime parameter.\n* `Db::open_tree` now returns a `Tree` instead of\n  an `Arc<Tree>`. `Tree` now has an inner type that\n  uses an `Arc`, so you don't need to think about it.\n\n## Bug Fixes\n\n* A bug with prefix encoding has been fixed that\n  led to nodes with keys longer than 256 bytes\n  being stored incorrectly, which led to them\n  being inaccessible and also leading to infinite\n  loops during iteration.\n* Several cases of incorrect unsafe code were removed\n  from the sled crate. No bugs are known to have been\n  encountered, but they may have resulted in\n  incorrect optimizations in future refactors.\n\n# 0.27\n\n## Breaking Changes\n\n* `Event::Set` has been renamed to `Event::Insert` and\n  `Event::Del` has been renamed to `Event::Remove`. 
These\n  names better align with the methods of BTreeMap from\n  the standard library.\n\n## Bug Fixes\n\n* A deadlock was possible in very high write volume\n  situations when the segment accountant lock was\n  taken by all IO threads while a task was blocked\n  trying to submit a file truncation request to the\n  threadpool while holding the segment accountant lock.\n\n## New Features\n\n* `flush_async` has been added to perform time-intensive\n  flushing in an asynchronous manner, returning a Future.\n\n# 0.26.1\n\n## Improvements\n\n* std::thread is no longer used on platforms other than\n  linux, macos, and windows, which increases portability.\n\n# 0.26\n\n## New Features\n\n* Transactions! You may now call `Tree::transaction` and\n  perform reads, writes, and deletes within a provided\n  closure with a `TransactionalTree` argument. This\n  closure may be called multiple times if the transaction\n  encounters a concurrent update in the process of its\n  execution. Transactions may also be used on tuples of\n  `Tree` objects, where the closure will then be\n  parameterized on `TransactionalTree` instances providing\n  access to each of the provided `Tree` instances. This\n  allows you to atomically read and modify multiple\n  `Tree` instances in a single atomic operation.\n  These transactions are serializable, fully ACID,\n  and optimistic.\n* `Tree::apply_batch` allows you to apply a `Batch`\n* `TransactionalTree::apply_batch` allows you to\n  apply a `Batch` from within a transaction.\n\n## Breaking Changes\n\n* `Tree::batch` has been removed. Now you can directly\n  create a `Batch` with `Batch::default()` and then apply\n  it to a `Tree` with `Tree::apply_batch` or during a\n  transaction using `TransactionalTree::apply_batch`.\n  This facilitates multi-`Tree` batches via transactions.\n* `Event::Merge` has been removed, and `Tree::merge` will\n  now send a complete `Event::Set` item to be distributed\n  to all listening subscribers.\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Welcome to the Project :)\n\n* Don't be a jerk - here's our [code of conduct](./code-of-conduct.md).\n  We have a track record of defending our community from harm.\n\nThere are at least three great ways to contribute to sled:\n\n* [financial contribution](https://github.com/sponsors/spacejam)\n* coding\n* conversation\n\n#### Coding Considerations:\n\nPlease don't waste your time or ours by implementing things that\nwe do not want to introduce and maintain. Please discuss in an\nissue or on chat before submitting a PR with:\n\n* public API changes\n* new functionality of any sort\n* additional unsafe code\n* significant refactoring\n\nThe above changes are unlikely to be merged or receive\ntimely attention without prior discussion.\n\nPRs that generally require less coordination beforehand:\n\n* Anything addressing a correctness issue.\n* Better docs: whatever you find confusing!\n* Small code changes with big performance implications, substantiated with [responsibly-gathered metrics](https://sled.rs/perf#experiment-checklist).\n* FFI submodule changes: these are generally less well maintained than the Rust core, and benefit more from public assistance.\n* Generally any new kind of test that avoids biases inherent in the others.\n\n#### All PRs block on failing tests!\n\nsled has intense testing, including crash tests, multi-threaded tests with\ndelay injection, a variety of mechanically-generated tests that combine fault\ninjection with concurrency in interesting ways, cross-compilation and minimum\nsupported Rust version checks, LLVM sanitizers, and more. It can sometimes be\nchallenging to understand why something is failing these intense tests.\n\nFor better understanding test failures, please:\n\n1. read the failing test name and output log for clues\n1. try to reproduce the failed test locally by running its associated command from the [test script](https://github.com/spacejam/sled/blob/main/.github/workflows/test.yml)\n1. 
If it is not clear why your test is failing, feel free to ask for help understanding it, either on Discord or on the PR, and we will do our best to help.\n\nWant to help sled but don't have time for individual contributions? Contribute via [GitHub Sponsors](https://github.com/sponsors/spacejam) to support the people pushing the project forward!\n"
  },
  {
    "path": "Cargo.toml",
    "content": "[package]\nname = \"sled\"\nversion = \"1.0.0-alpha.124\"\nedition = \"2024\"\nauthors = [\"Tyler Neely <tylerneely@gmail.com>\"]\ndocumentation = \"https://docs.rs/sled/\"\ndescription = \"Lightweight high-performance pure-rust transactional embedded database.\"\nlicense = \"MIT OR Apache-2.0\"\nhomepage = \"https://github.com/spacejam/sled\"\nrepository = \"https://github.com/spacejam/sled\"\nkeywords = [\"redis\", \"mongo\", \"sqlite\", \"lmdb\", \"rocksdb\"]\ncategories = [\"database-implementations\", \"concurrency\", \"data-structures\", \"algorithms\", \"caching\"]\nreadme = \"README.md\"\nexclude = [\"benchmarks\", \"examples\", \"bindings\", \"scripts\", \"experiments\"]\n\n[features]\n# initializes allocated memory to 0xa1, writes 0xde to deallocated memory before freeing it\ntesting-shred-allocator = []\n# use a counting global allocator that provides the sled::alloc::{allocated, freed, resident, reset} functions\ntesting-count-allocator = []\nfor-internal-testing-only = []\n# turn off re-use of object IDs and heap slots, disable tree leaf merges, disable heap file truncation.\nmonotonic-behavior = []\n\n[profile.release]\ndebug = true\nopt-level = 3\noverflow-checks = true\npanic = \"abort\"\n\n[profile.test]\ndebug = true\noverflow-checks = true\npanic = \"abort\"\n\n[dependencies]\nbincode = \"1.3.3\"\ncache-advisor = \"1.0.16\"\nconcurrent-map = { version = \"5.0.31\", features = [\"serde\"] }\ncrc32fast = \"1.3.2\"\nebr = \"0.2.13\"\ninline-array = { version = \"0.1.13\", features = [\"serde\", \"concurrent_map_minimum\"] }\nfs2 = \"0.4.3\"\nlog = \"0.4.19\"\npagetable = \"0.4.5\"\nparking_lot = { version = \"0.12.1\", features = [\"arc_lock\"] }\nrayon = \"1.7.0\"\nserde = { version = \"1.0\", features = [\"derive\"] }\nstack-map = { version = \"1.0.5\", features = [\"serde\"] }\nzstd = \"0.12.4\"\nfnv = \"1.0.7\"\nfault-injection = \"1.0.10\"\ncrossbeam-queue = \"0.3.8\"\ncrossbeam-channel = \"0.5.8\"\ntempdir = 
\"0.3.7\"\n\n[dev-dependencies]\nenv_logger = \"0.10.0\"\nnum-format = \"0.4.4\"\n# heed = \"0.11.0\"\n# rocksdb = \"0.21.0\"\n# rusqlite = \"0.29.0\"\n# old_sled = { version = \"0.34\", package = \"sled\" }\nrand = \"0.9\"\nquickcheck = \"1.0.3\"\nrand_distr = \"0.5\"\nlibc = \"0.2.147\"\n\n[[test]]\nname = \"test_crash_recovery\"\npath = \"tests/test_crash_recovery.rs\"\nharness = false\n\n"
  },
  {
    "path": "LICENSE-APACHE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"{}\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2015 Tyler Neely\n   Copyright 2016 Tyler Neely\n   Copyright 2017 Tyler Neely\n   Copyright 2018 Tyler Neely\n   Copyright 2019 Tyler Neely\n   Copyright 2020 Tyler Neely\n   Copyright 2021 Tyler Neely\n   Copyright 2022 Tyler Neely\n   Copyright 2023 Tyler Neely\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "LICENSE-MIT",
    "content": "Copyright (c) 2015 Tyler Neely\nCopyright (c) 2016 Tyler Neely\nCopyright (c) 2017 Tyler Neely\nCopyright (c) 2018 Tyler Neely\nCopyright (c) 2019 Tyler Neely\nCopyright (c) 2020 Tyler Neely\nCopyright (c) 2021 Tyler Neely\nCopyright (c) 2022 Tyler Neely\nCopyright (c) 2023 Tyler Neely\n\nPermission is hereby granted, free of charge, to any\nperson obtaining a copy of this software and associated\ndocumentation files (the \"Software\"), to deal in the\nSoftware without restriction, including without\nlimitation the rights to use, copy, modify, merge,\npublish, distribute, sublicense, and/or sell copies of\nthe Software, and to permit persons to whom the Software\nis furnished to do so, subject to the following\nconditions:\n\nThe above copyright notice and this permission notice\nshall be included in all copies or substantial portions\nof the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF\nANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED\nTO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A\nPARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT\nSHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY\nCLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION\nOF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR\nIN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\nDEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "\n<table style=\"width:100%\">\n<tr>\n  <td>\n    <table style=\"width:100%\">\n      <tr>\n        <td> key </td>\n        <td> value </td>\n      </tr>\n      <tr>\n        <td><a href=\"https://docs.rs/sled\">documentation</a></td>\n        <td><a href=\"https://docs.rs/sled\"><img src=\"https://docs.rs/sled/badge.svg\"></a></td>\n      </tr>\n      <tr>\n        <td><a href=\"https://discord.gg/Z6VsXds\">chat about databases with us</a></td>\n        <td><a href=\"https://discord.gg/Z6VsXds\"><img src=\"https://img.shields.io/discord/509773073294295082.svg?logo=discord\"></a></td>\n      </tr>\n     </table>\n  </td>\n  <td>\n<p align=\"center\">\n  <img src=\"https://raw.githubusercontent.com/spacejam/sled/main/art/tree_face_anti-transphobia.png\" width=\"40%\" height=\"auto\" />\n  </p>\n  </td>\n </tr>\n</table>\n\n\n# sled - ~~it's all downhill from here!!!~~\n\nAn embedded database.\n\n```rust\nlet tree = sled::open(\"/tmp/welcome-to-sled\")?;\n\n// insert and get, similar to std's BTreeMap\nlet old_value = tree.insert(\"key\", \"value\")?;\n\nassert_eq!(\n  tree.get(&\"key\")?,\n  Some(sled::IVec::from(\"value\")),\n);\n\n// range queries\nfor kv_result in tree.range(\"key_1\"..\"key_9\") {}\n\n// deletion\nlet old_value = tree.remove(&\"key\")?;\n\n// atomic compare and swap\ntree.compare_and_swap(\n  \"key\",\n  Some(\"current_value\"),\n  Some(\"new_value\"),\n)?;\n\n// block until all operations are stable on disk\n// (flush_async also available to get a Future)\ntree.flush()?;\n```\n\n$${\\color{red}This \\space README \\space is \\space out \\space of \\space sync \\space with \\space the \\space main \\space branch \\space which \\space contains \\space a \\space large \\space in-progress \\space rewrite }$$\n\nIf you would like to work with structured data without paying expensive deserialization costs, check out the [structured](examples/structured.rs) example!\n\n# features\n\n* [API](https://docs.rs/sled) similar to a threadsafe 
`BTreeMap<[u8], [u8]>`\n* serializable (ACID) [transactions](https://docs.rs/sled/latest/sled/struct.Tree.html#method.transaction)\n  for atomically reading and writing to multiple keys in multiple keyspaces.\n* fully atomic single-key operations, including [compare and swap](https://docs.rs/sled/latest/sled/struct.Tree.html#method.compare_and_swap)\n* zero-copy reads\n* [write batches](https://docs.rs/sled/latest/sled/struct.Tree.html#method.apply_batch)\n* [subscribe to changes on key\n  prefixes](https://docs.rs/sled/latest/sled/struct.Tree.html#method.watch_prefix)\n* [multiple keyspaces](https://docs.rs/sled/latest/sled/struct.Db.html#method.open_tree)\n* [merge operators](https://docs.rs/sled/latest/sled/doc/merge_operators/index.html)\n* forward and reverse iterators over ranges of items\n* a crash-safe monotonic [ID generator](https://docs.rs/sled/latest/sled/struct.Db.html#method.generate_id)\n  capable of generating 75-125 million unique ID's per second\n* [zstd](https://github.com/facebook/zstd) compression (use the\n  `compression` build feature, disabled by default)\n* cpu-scalable lock-free implementation\n* flash-optimized log-structured storage\n* uses modern b-tree techniques such as prefix encoding and suffix\n  truncation for reducing the storage costs of long keys with shared\n  prefixes. 
If keys are the same length and sequential then the\n  system can avoid storing 99%+ of the key data in most cases,\n  essentially acting like a learned index\n\n# expectations, gotchas, advice\n\n* Maybe one of the first things that seems weird is the `IVec` type.\n  This is an inlinable `Arc`ed slice that makes some things more efficient.\n* Durability: **sled automatically fsyncs every 500ms by default**,\n  which can be configured with the `flush_every_ms` configurable, or you may\n  call `flush` / `flush_async` manually after operations.\n* **Transactions are optimistic** - do not interact with external state\n  or perform IO from within a transaction closure unless it is\n  [idempotent](https://en.wikipedia.org/wiki/Idempotent).\n* Internal tree node optimizations: sled performs prefix encoding\n  on long keys with similar prefixes that are grouped together in a range,\n  as well as suffix truncation to further reduce the indexing costs of\n  long keys. Nodes will skip potentially expensive length and offset pointers\n  if keys or values are all the same length (tracked separately, don't worry\n  about making keys the same length as values), so it may improve space usage\n  slightly if you use fixed-length keys or values. This also makes it easier\n  to use [structured access](examples/structured.rs) as well.\n* sled does not support multiple open instances for the time being. Please\n  keep sled open for the duration of your process's lifespan. It's totally\n  safe and often quite convenient to use a global lazy_static sled instance,\n  modulo the normal global variable trade-offs. 
Every operation is threadsafe,\n  and most are implemented under the hood with lock-free algorithms that avoid\n  blocking in hot paths.\n\n# performance\n\n* [LSM tree](https://en.wikipedia.org/wiki/Log-structured_merge-tree)-like write performance\n  with [traditional B+ tree](https://en.wikipedia.org/wiki/B%2B_tree)-like read performance\n* over a billion operations in under a minute at 95% read 5% writes on 16 cores on a small dataset\n* measure your own workloads rather than relying on some marketing for contrived workloads\n\n# a note on lexicographic ordering and endianness\n\nIf you want to store numerical keys in a way that will play nicely with sled's iterators and ordered operations, please remember to store your numerical items in big-endian form. Little endian (the default of many things) will often appear to be doing the right thing until you start working with more than 256 items (more than 1 byte), causing lexicographic ordering of the serialized bytes to diverge from the lexicographic ordering of their deserialized numerical form.\n\n* Rust integral types have built-in `to_be_bytes` and `from_be_bytes` [methods](https://doc.rust-lang.org/std/primitive.u64.html#method.from_be_bytes).\n* bincode [can be configured](https://docs.rs/bincode/1.2.0/bincode/struct.Config.html#method.big_endian) to store integral types in big-endian form.\n\n# interaction with async\n\nIf your dataset resides entirely in cache (achievable at startup by setting the cache\nto a large enough value and performing a full iteration) then all reads and writes are\nnon-blocking and async-friendly, without needing to use Futures or an async runtime.\n\nTo asynchronously suspend your async task on the durability of writes, we support the\n[`flush_async` method](https://docs.rs/sled/latest/sled/struct.Tree.html#method.flush_async),\nwhich returns a Future that your async tasks can await the completion of if they require\nhigh durability guarantees and you are willing to pay the 
latency costs of fsync.\nNote that sled automatically tries to sync all data to disk several times per second\nin the background without blocking user threads.\n\nWe support async subscription to events that happen on key prefixes, because the\n`Subscriber` struct implements `Future<Output=Option<Event>>`:\n\n```rust\nlet sled = sled::open(\"my_db\").unwrap();\n\nlet mut sub = sled.watch_prefix(\"\");\n\nsled.insert(b\"a\", b\"a\").unwrap();\n\nextreme::run(async move {\n    while let Some(event) = (&mut sub).await {\n        println!(\"got event {:?}\", event);\n    }\n});\n```\n\n# minimum supported Rust version (MSRV)\n\nWe support Rust 1.62 and up.\n\n# architecture\n\nlock-free tree on a lock-free pagecache on a lock-free log. the pagecache scatters\npartial page fragments across the log, rather than rewriting entire pages at a time\nas B+ trees for spinning disks historically have. on page reads, we concurrently\nscatter-gather reads across the log to materialize the page from its fragments.\ncheck out the [architectural outlook](https://github.com/spacejam/sled/wiki/sled-architectural-outlook)\nfor a more detailed overview of where we're at and where we see things going!\n\n# philosophy\n\n1. don't make the user think. the interface should be obvious.\n1. don't surprise users with performance traps.\n1. don't wake up operators. bring reliability techniques from academia into real-world practice.\n1. don't use so much electricity. our data structures should play to modern hardware's strengths.\n\n# known issues, warnings\n\n* if reliability is your primary constraint, use SQLite. sled is beta.\n* if storage price performance is your primary constraint, use RocksDB. sled uses too much space sometimes.\n* if you have a multi-process workload that rarely writes, use LMDB. 
sled is architected for use with long-running, highly-concurrent workloads such as stateful services or higher-level databases.\n* quite young, should be considered unstable for the time being.\n* the on-disk format is going to change in ways that require [manual migrations](https://docs.rs/sled/latest/sled/struct.Db.html#method.export) before the `1.0.0` release!\n\n# priorities\n\n1. A full rewrite of sled's storage subsystem is happening on a modular basis as part of the [komora project](https://github.com/komora-io), in particular the marble storage engine. This will dramatically lower both the disk space usage (space amplification) and garbage collection overhead (write amplification) of sled.\n2. The memory layout of tree nodes is being completely rewritten to reduce fragmentation and eliminate serialization costs.\n3. The merge operator feature will change into a trigger feature that resembles traditional database triggers, allowing state to be modified as part of the same atomic writebatch that triggered it for retaining serializability with reactive semantics.\n\n# fund feature development\n\nLike what we're doing? Help us out via [GitHub Sponsors](https://github.com/sponsors/spacejam)!\n"
  },
  {
    "path": "RELEASE_CHECKLIST.md",
    "content": "# Release Checklist\n\nThis checklist must be completed before publishing a release of any kind.\n\nOver time, anything in this list that can be turned into an automated test should be, but\nthere are still some big blind spots.\n\n## API stability\n\n- [ ] rust-flavored semver respected\n\n## Performance\n\n- [ ] micro-benchmark regressions should not happen unless newly discovered correctness criteria demands them\n- [ ] mixed point operation latency distribution should narrow over time\n- [ ] sequential operation average throughput should increase over time\n- [ ] workloads should pass TSAN and ASAN on macOS. Linux should additionally pass LSAN & MSAN.\n- [ ] workload write and space amplification thresholds should see no regressions\n\n## Concurrency Audit\n\n- [ ] any new `Guard` objects are dropped inside the rayon threadpool\n- [ ] no new EBR `Collector`s, as they destroy causality. These will be optimized in-bulk in the future.\n- [ ] no code assumes a recently read page pointer will remain unchanged (transactions may change this if reads are inline)\n- [ ] no calls to `rand::thread_rng` from a droppable function (anything in the SegmentAccountant)\n\n## Burn-In\n\n- [ ] fuzz tests should run at least 24 hours each with zero crashes\n- [ ] sequential and point workloads run at least 24 hours in constrained docker container without OOM / out of disk\n"
  },
  {
    "path": "SAFETY.md",
    "content": "# sled safety model\n\nThis document applies\n[STPA](http://psas.scripts.mit.edu/home/get_file.php?name=STPA_handbook.pdf)-style\nhazard analysis to the sled embedded database for the purpose of guiding\ndesign and testing efforts to prevent unacceptable losses.\n\nOutline\n\n* [purpose of analysis](#purpose-of-analysis)\n  * [losses](#losses)\n  * [system boundary](#system-boundary)\n  * [hazards](#hazards)\n  * [leading indicators](#leading-indicators)\n  * [constraints](#constraints)\n* [model of control structure](#model-of-control-structure)\n* [identify unsafe control actions](#identify-unsafe-control-actions)\n* [identify loss scenarios][#identify-loss-scenarios)\n* [resources for learning more about STAMP, STPA, and CAST](#resources)\n\n# Purpose of Analysis\n\n## Losses\n\nWe wish to prevent the following undesirable situations:\n\n* data loss\n* inconsistent (non-linearizable) data access\n* process crash\n* resource exhaustion\n\n## System Boundary\n\nWe draw the line between system and environment where we can reasonably\ninvest our efforts to prevent losses.\n\nInside the boundary:\n\n* codebase\n  * put safe control actions into place that prevent losses\n* documentation\n  * show users how to use sled safely\n  * recommend hardware, kernels, user code\n\nOutside the boundary:\n\n* Direct changes to hardware, kernels, user code\n\n## Hazards\n\nThese hazards can result in the above losses:\n\n* data may be lost if\n  * bugs in the logging system\n    * `Db::flush` fails to make previous writes durable\n  * bugs in the GC system\n    * the old location is overwritten before the defragmented location becomes durable\n  * bugs in the recovery system\n  * hardare failures\n* consistency violations may be caused by\n  * transaction concurrency control failure to enforce linearizability (strict serializability)\n  * non-linearizable lock-free single-key operations\n* panic\n  * of user threads\n  * IO threads\n  * flusher & GC thread\n  * 
indexing\n  * unwraps/expects\n  * failed TryInto/TryFrom + unwrap\n* persistent storage exceeding (2 + N concurrent writers) * logical data size\n* in-memory cache exceeding the configured cache size\n  * caused by incorrect calculation of cache\n* use-after-free\n* data race\n* memory leak\n* integer overflow\n* buffer overrun\n* uninitialized memory access\n\n## Constraints\n\n# Models of Control Structures\n\nfor each control action we have, consider:\n\n1. what hazards happen when we fail to apply it / it does not exist?\n2. what hazards happen when we do apply it\n3. what hazards happen when we apply it too early or too late?\n4. what hazards happen if we apply it for too long or not long enough?\n\ndurability model\n\n  * recovery\n    * LogIter::max_lsn\n      * return None if last_lsn_in_batch >= self.max_lsn\n    * batch requirement set to last reservation base + inline len - 1\n      * reserve bumps\n        * bump_atomic_lsn(&self.iobufs.max_reserved_lsn, reservation_lsn + inline_buf_len as Lsn - 1);\n\nlock-free linearizability model\n\ntransactional linearizability (strict serializability) model\n\npanic model\n\nmemory usage model\n\nstorage usage model\n\n"
  },
  {
    "path": "SECURITY.md",
    "content": "# Security Policy\n\n## Reporting a Vulnerability\n\nsled uses some unsafe functionality in the core lock-free algorithms, and in a few places to more efficiently copy data.\n\nPlease contact [Tyler Neely](mailto:tylerneely@gmail.com?subject=sled%20security%20issue) immediately if you find any vulnerability, and I will work with you to fix the issue rapidly and coordinate public disclosure with an expedited release including the fix.\n\nIf you are a bug hunter or a person with a security interest, here is my mental model of memory corruption risk in the sled codebase:\n\n1. memory issues relating to the lock-free data structures in their colder failure paths. these have been tested a bit by injecting delays into random places, but this is still an area with elevated risk\n1. anywhere the `unsafe` keyword is used\n"
  },
  {
    "path": "art/CREDITS",
    "content": "original tree logo with face:\n  https://twitter.com/daiyitastic\n\nanti-transphobia additions:\n  spacejam\n"
  },
  {
    "path": "code-of-conduct.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nIn the interest of fostering an open and welcoming environment, we as\ncontributors and maintainers pledge to making participation in our project and\nour community a harassment-free experience for everyone, regardless of age, body\nsize, disability, ethnicity, sex characteristics, gender identity and expression,\nlevel of experience, education, socio-economic status, nationality, personal\nappearance, race, religion, or sexual identity and orientation.\n\n## Our Standards\n\nExamples of behavior that contributes to creating a positive environment\ninclude:\n\n* Using welcoming and inclusive language\n* Being respectful of differing viewpoints and experiences\n* Gracefully accepting constructive criticism\n* Focusing on what is best for the community\n* Showing empathy towards other community members\n\nExamples of unacceptable behavior by participants include:\n\n* The use of sexualized language or imagery and unwelcome sexual attention or\n  advances\n* Trolling, insulting/derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or electronic\n  address, without explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Our Responsibilities\n\nProject maintainers are responsible for clarifying the standards of acceptable\nbehavior and are expected to take appropriate and fair corrective action in\nresponse to any instances of unacceptable behavior.\n\nProject maintainers have the right and responsibility to remove, edit, or\nreject comments, commits, code, wiki edits, issues, and other contributions\nthat are not aligned to this Code of Conduct, or to ban temporarily or\npermanently any contributor for other behaviors that they deem inappropriate,\nthreatening, offensive, or harmful.\n\n## Scope\n\nThis Code of Conduct applies both 
within project spaces and in public spaces\nwhen an individual is representing the project or its community. Examples of\nrepresenting a project or community include using an official project e-mail\naddress, posting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event. Representation of a project may be\nfurther defined and clarified by project maintainers.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported by contacting the project team at tylerneely@gmail.com. All\ncomplaints will be reviewed and investigated and will result in a response that\nis deemed necessary and appropriate to the circumstances. The project team is\nobligated to maintain confidentiality with regard to the reporter of an incident.\nFurther details of specific enforcement policies may be posted separately.\n\nProject maintainers who do not follow or enforce the Code of Conduct in good\nfaith may face temporary or permanent repercussions as determined by other\nmembers of the project's leadership.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,\navailable at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html\n\n[homepage]: https://www.contributor-covenant.org\n\n"
  },
  {
    "path": "examples/bench.rs",
    "content": "use std::path::Path;\nuse std::sync::Barrier;\nuse std::thread::scope;\nuse std::time::{Duration, Instant};\nuse std::{fs, io};\n\nuse num_format::{Locale, ToFormattedString};\n\nuse sled::{Config, Db as SledDb};\n\ntype Db = SledDb<1024>;\n\nconst N_WRITES_PER_THREAD: u32 = 4 * 1024 * 1024;\nconst MAX_CONCURRENCY: u32 = 4;\nconst CONCURRENCY: &[usize] = &[/*1, 2, 4,*/ MAX_CONCURRENCY as _];\nconst BYTES_PER_ITEM: u32 = 8;\n\ntrait Databench: Clone + Send {\n    type READ: AsRef<[u8]>;\n    const NAME: &'static str;\n    const PATH: &'static str;\n    fn open() -> Self;\n    fn remove_generic(&self, key: &[u8]);\n    fn insert_generic(&self, key: &[u8], value: &[u8]);\n    fn get_generic(&self, key: &[u8]) -> Option<Self::READ>;\n    fn flush_generic(&self);\n    fn print_stats(&self);\n}\n\nimpl Databench for Db {\n    type READ = sled::InlineArray;\n\n    const NAME: &'static str = \"sled 1.0.0-alpha\";\n    const PATH: &'static str = \"timing_test.sled-new\";\n\n    fn open() -> Self {\n        sled::Config {\n            path: Self::PATH.into(),\n            zstd_compression_level: 3,\n            cache_capacity_bytes: 1024 * 1024 * 1024,\n            entry_cache_percent: 20,\n            flush_every_ms: Some(200),\n            ..Config::default()\n        }\n        .open()\n        .unwrap()\n    }\n\n    fn insert_generic(&self, key: &[u8], value: &[u8]) {\n        self.insert(key, value).unwrap();\n    }\n    fn remove_generic(&self, key: &[u8]) {\n        self.remove(key).unwrap();\n    }\n    fn get_generic(&self, key: &[u8]) -> Option<Self::READ> {\n        self.get(key).unwrap()\n    }\n    fn flush_generic(&self) {\n        self.flush().unwrap();\n    }\n    fn print_stats(&self) {\n        dbg!(self.stats());\n    }\n}\n\n/*\nimpl Databench for old_sled::Db {\n    type READ = old_sled::IVec;\n\n    const NAME: &'static str = \"sled 0.34.7\";\n    const PATH: &'static str = \"timing_test.sled-old\";\n\n    fn open() -> Self {\n        
old_sled::open(Self::PATH).unwrap()\n    }\n    fn insert_generic(&self, key: &[u8], value: &[u8]) {\n        self.insert(key, value).unwrap();\n    }\n    fn get_generic(&self, key: &[u8]) -> Option<Self::READ> {\n        self.get(key).unwrap()\n    }\n    fn flush_generic(&self) {\n        self.flush().unwrap();\n    }\n}\n*/\n\n/*\nimpl Databench for Arc<rocksdb::DB> {\n    type READ = Vec<u8>;\n\n    const NAME: &'static str = \"rocksdb 0.21.0\";\n    const PATH: &'static str = \"timing_test.rocksdb\";\n\n    fn open() -> Self {\n        Arc::new(rocksdb::DB::open_default(Self::PATH).unwrap())\n    }\n    fn insert_generic(&self, key: &[u8], value: &[u8]) {\n        self.put(key, value).unwrap();\n    }\n    fn get_generic(&self, key: &[u8]) -> Option<Self::READ> {\n        self.get(key).unwrap()\n    }\n    fn flush_generic(&self) {\n        self.flush().unwrap();\n    }\n}\n*/\n\n/*\nstruct Lmdb {\n    env: heed::Env,\n    db: heed::Database<\n        heed::types::UnalignedSlice<u8>,\n        heed::types::UnalignedSlice<u8>,\n    >,\n}\n\nimpl Clone for Lmdb {\n    fn clone(&self) -> Lmdb {\n        Lmdb { env: self.env.clone(), db: self.db.clone() }\n    }\n}\n\nimpl Databench for Lmdb {\n    type READ = Vec<u8>;\n\n    const NAME: &'static str = \"lmdb\";\n    const PATH: &'static str = \"timing_test.lmdb\";\n\n    fn open() -> Self {\n        let _ = std::fs::create_dir_all(Self::PATH);\n        let env = heed::EnvOpenOptions::new()\n            .map_size(1024 * 1024 * 1024)\n            .open(Self::PATH)\n            .unwrap();\n        let db = env.create_database(None).unwrap();\n        Lmdb { env, db }\n    }\n    fn insert_generic(&self, key: &[u8], value: &[u8]) {\n        let mut wtxn = self.env.write_txn().unwrap();\n        self.db.put(&mut wtxn, key, value).unwrap();\n        wtxn.commit().unwrap();\n    }\n    fn get_generic(&self, key: &[u8]) -> Option<Self::READ> {\n        let rtxn = self.env.read_txn().unwrap();\n        let ret = 
self.db.get(&rtxn, key).unwrap().map(Vec::from);\n        rtxn.commit().unwrap();\n        ret\n    }\n    fn flush_generic(&self) {\n        // NOOP\n    }\n}\n*/\n\n/*\nstruct Sqlite {\n    connection: rusqlite::Connection,\n}\n\nimpl Clone for Sqlite {\n    fn clone(&self) -> Sqlite {\n        Sqlite { connection: rusqlite::Connection::open(Self::PATH).unwrap() }\n    }\n}\n\nimpl Databench for Sqlite {\n    type READ = Vec<u8>;\n\n    const NAME: &'static str = \"sqlite\";\n    const PATH: &'static str = \"timing_test.sqlite\";\n\n    fn open() -> Self {\n        let connection = rusqlite::Connection::open(Self::PATH).unwrap();\n        connection\n            .execute(\n                \"create table if not exists bench (\n                     key integer primary key,\n                     val integer not null\n                 )\",\n                [],\n            )\n            .unwrap();\n        Sqlite { connection }\n    }\n    fn insert_generic(&self, key: &[u8], value: &[u8]) {\n        loop {\n            let res = self.connection.execute(\n                \"insert or ignore into bench (key, val) values (?1, ?2)\",\n                [\n                    format!(\"{}\", u32::from_be_bytes(key.try_into().unwrap())),\n                    format!(\n                        \"{}\",\n                        u32::from_be_bytes(value.try_into().unwrap())\n                    ),\n                ],\n            );\n            if res.is_ok() {\n                break;\n            }\n        }\n    }\n    fn get_generic(&self, key: &[u8]) -> Option<Self::READ> {\n        let mut stmt = self\n            .connection\n            .prepare(\"SELECT b.val from bench b WHERE key = ?1\")\n            .unwrap();\n        let mut rows =\n            stmt.query([u32::from_be_bytes(key.try_into().unwrap())]).unwrap();\n\n        let value = rows.next().unwrap()?;\n        value.get(0).ok()\n    }\n    fn flush_generic(&self) {\n        // NOOP\n    }\n}\n*/\n\nfn 
allocated() -> usize {\n    #[cfg(feature = \"testing-count-allocator\")]\n    {\n        return sled::alloc::allocated();\n    }\n    0\n}\n\nfn freed() -> usize {\n    #[cfg(feature = \"testing-count-allocator\")]\n    {\n        return sled::alloc::freed();\n    }\n    0\n}\n\nfn resident() -> usize {\n    #[cfg(feature = \"testing-count-allocator\")]\n    {\n        return sled::alloc::resident();\n    }\n    0\n}\n\nfn inserts<D: Databench>(store: &D) -> Vec<InsertStats> {\n    println!(\"{} inserts\", D::NAME);\n    let mut i = 0_u32;\n\n    let factory = move || {\n        i += 1;\n        (store.clone(), i - 1)\n    };\n\n    let f = |state: (D, u32)| {\n        let (store, offset) = state;\n        let start = N_WRITES_PER_THREAD * offset;\n        let end = N_WRITES_PER_THREAD * (offset + 1);\n        for i in start..end {\n            let k: &[u8] = &i.to_be_bytes();\n            store.insert_generic(k, k);\n        }\n    };\n\n    let mut ret = vec![];\n\n    for concurrency in CONCURRENCY {\n        let insert_elapsed =\n            execute_lockstep_concurrent(factory, f, *concurrency);\n\n        let flush_timer = Instant::now();\n        store.flush_generic();\n\n        let wps = (N_WRITES_PER_THREAD * *concurrency as u32) as u64\n            * 1_000_000_u64\n            / u64::try_from(insert_elapsed.as_micros().max(1))\n                .unwrap_or(u64::MAX);\n\n        ret.push(InsertStats {\n            thread_count: *concurrency,\n            inserts_per_second: wps,\n        });\n\n        println!(\n            \"{} inserts/s with {concurrency} threads over {:?}, then {:?} to flush {}\",\n            wps.to_formatted_string(&Locale::en),\n            insert_elapsed,\n            flush_timer.elapsed(),\n            D::NAME,\n        );\n    }\n\n    ret\n}\n\nfn removes<D: Databench>(store: &D) -> Vec<RemoveStats> {\n    println!(\"{} removals\", D::NAME);\n    let mut i = 0_u32;\n\n    let factory = move || {\n        i += 1;\n        
(store.clone(), i - 1)\n    };\n\n    let f = |state: (D, u32)| {\n        let (store, offset) = state;\n        let start = N_WRITES_PER_THREAD * offset;\n        let end = N_WRITES_PER_THREAD * (offset + 1);\n        for i in start..end {\n            let k: &[u8] = &i.to_be_bytes();\n            store.remove_generic(k);\n        }\n    };\n\n    let mut ret = vec![];\n\n    for concurrency in CONCURRENCY {\n        let remove_elapsed =\n            execute_lockstep_concurrent(factory, f, *concurrency);\n\n        let flush_timer = Instant::now();\n        store.flush_generic();\n\n        let wps = (N_WRITES_PER_THREAD * *concurrency as u32) as u64\n            * 1_000_000_u64\n            / u64::try_from(remove_elapsed.as_micros().max(1))\n                .unwrap_or(u64::MAX);\n\n        ret.push(RemoveStats {\n            thread_count: *concurrency,\n            removes_per_second: wps,\n        });\n\n        println!(\n            \"{} removes/s with {concurrency} threads over {:?}, then {:?} to flush {}\",\n            wps.to_formatted_string(&Locale::en),\n            remove_elapsed,\n            flush_timer.elapsed(),\n            D::NAME,\n        );\n    }\n\n    ret\n}\n\nfn gets<D: Databench>(store: &D) -> Vec<GetStats> {\n    println!(\"{} reads\", D::NAME);\n\n    let factory = || store.clone();\n\n    let f = |store: D| {\n        let start = 0;\n        let end = N_WRITES_PER_THREAD * MAX_CONCURRENCY;\n        for i in start..end {\n            let k: &[u8] = &i.to_be_bytes();\n            store.get_generic(k);\n        }\n    };\n\n    let mut ret = vec![];\n\n    for concurrency in CONCURRENCY {\n        let get_stone_elapsed =\n            execute_lockstep_concurrent(factory, f, *concurrency);\n\n        let rps = (N_WRITES_PER_THREAD * MAX_CONCURRENCY * *concurrency as u32)\n            as u64\n            * 1_000_000_u64\n            / u64::try_from(get_stone_elapsed.as_micros().max(1))\n                .unwrap_or(u64::MAX);\n\n        
ret.push(GetStats { thread_count: *concurrency, gets_per_second: rps });\n\n        println!(\n            \"{} gets/s with concurrency of {concurrency}, {:?} total reads {}\",\n            rps.to_formatted_string(&Locale::en),\n            get_stone_elapsed,\n            D::NAME\n        );\n    }\n    ret\n}\n\nfn execute_lockstep_concurrent<\n    State: Send,\n    Factory: FnMut() -> State,\n    F: Sync + Fn(State),\n>(\n    mut factory: Factory,\n    f: F,\n    concurrency: usize,\n) -> Duration {\n    let barrier = &Barrier::new(concurrency + 1);\n    let f = &f;\n\n    scope(|s| {\n        let mut threads = vec![];\n\n        for _ in 0..concurrency {\n            let state = factory();\n\n            let thread = s.spawn(move || {\n                barrier.wait();\n                f(state);\n            });\n\n            threads.push(thread);\n        }\n\n        barrier.wait();\n        let get_stone = Instant::now();\n\n        for thread in threads.into_iter() {\n            thread.join().unwrap();\n        }\n\n        get_stone.elapsed()\n    })\n}\n\n#[derive(Debug, Clone, Copy)]\nstruct InsertStats {\n    thread_count: usize,\n    inserts_per_second: u64,\n}\n\n#[derive(Debug, Clone, Copy)]\nstruct GetStats {\n    thread_count: usize,\n    gets_per_second: u64,\n}\n\n#[derive(Debug, Clone, Copy)]\nstruct RemoveStats {\n    thread_count: usize,\n    removes_per_second: u64,\n}\n\n#[allow(unused)]\n#[derive(Debug, Clone)]\nstruct Stats {\n    post_insert_disk_space: u64,\n    post_remove_disk_space: u64,\n    allocated_memory: usize,\n    freed_memory: usize,\n    resident_memory: usize,\n    insert_stats: Vec<InsertStats>,\n    get_stats: Vec<GetStats>,\n    remove_stats: Vec<RemoveStats>,\n}\n\nimpl Stats {\n    fn print_report(&self) {\n        println!(\n            \"bytes on disk after inserts: {}\",\n            self.post_insert_disk_space.to_formatted_string(&Locale::en)\n        );\n        println!(\n            \"bytes on disk after removes: 
{}\",\n            self.post_remove_disk_space.to_formatted_string(&Locale::en)\n        );\n        println!(\n            \"bytes in memory: {}\",\n            self.resident_memory.to_formatted_string(&Locale::en)\n        );\n        for stats in &self.insert_stats {\n            println!(\n                \"{} threads {} inserts per second\",\n                stats.thread_count,\n                stats.inserts_per_second.to_formatted_string(&Locale::en)\n            );\n        }\n        for stats in &self.get_stats {\n            println!(\n                \"{} threads {} gets per second\",\n                stats.thread_count,\n                stats.gets_per_second.to_formatted_string(&Locale::en)\n            );\n        }\n        for stats in &self.remove_stats {\n            println!(\n                \"{} threads {} removes per second\",\n                stats.thread_count,\n                stats.removes_per_second.to_formatted_string(&Locale::en)\n            );\n        }\n    }\n}\n\nfn bench<D: Databench>() -> Stats {\n    let store = D::open();\n\n    let insert_stats = inserts(&store);\n\n    let before_flush = Instant::now();\n    store.flush_generic();\n    println!(\"final flush took {:?} for {}\", before_flush.elapsed(), D::NAME);\n\n    let post_insert_disk_space = du(D::PATH.as_ref()).unwrap();\n\n    let get_stats = gets(&store);\n\n    let remove_stats = removes(&store);\n\n    store.print_stats();\n\n    Stats {\n        post_insert_disk_space,\n        post_remove_disk_space: du(D::PATH.as_ref()).unwrap(),\n        allocated_memory: allocated(),\n        freed_memory: freed(),\n        resident_memory: resident(),\n        insert_stats,\n        get_stats,\n        remove_stats,\n    }\n}\n\nfn du(path: &Path) -> io::Result<u64> {\n    fn recurse(mut dir: fs::ReadDir) -> io::Result<u64> {\n        dir.try_fold(0, |acc, file| {\n            let file = file?;\n            let size = match file.metadata()? 
{\n                data if data.is_dir() => recurse(fs::read_dir(file.path())?)?,\n                data => data.len(),\n            };\n            Ok(acc + size)\n        })\n    }\n\n    recurse(fs::read_dir(path)?)\n}\n\nfn main() {\n    let _ = env_logger::try_init();\n\n    let new_stats = bench::<Db>();\n\n    println!(\n        \"raw data size: {}\",\n        (MAX_CONCURRENCY * N_WRITES_PER_THREAD * BYTES_PER_ITEM)\n            .to_formatted_string(&Locale::en)\n    );\n    println!(\"sled 1.0 space stats:\");\n    new_stats.print_report();\n\n    /*\n    let old_stats = bench::<old_sled::Db>();\n    dbg!(old_stats);\n\n    let new_sled_vs_old_sled_storage_ratio =\n        new_stats.disk_space as f64 / old_stats.disk_space as f64;\n    let new_sled_vs_old_sled_allocated_memory_ratio =\n        new_stats.allocated_memory as f64 / old_stats.allocated_memory as f64;\n    let new_sled_vs_old_sled_freed_memory_ratio =\n        new_stats.freed_memory as f64 / old_stats.freed_memory as f64;\n    let new_sled_vs_old_sled_resident_memory_ratio =\n        new_stats.resident_memory as f64 / old_stats.resident_memory as f64;\n\n    dbg!(new_sled_vs_old_sled_storage_ratio);\n    dbg!(new_sled_vs_old_sled_allocated_memory_ratio);\n    dbg!(new_sled_vs_old_sled_freed_memory_ratio);\n    dbg!(new_sled_vs_old_sled_resident_memory_ratio);\n\n    let rocksdb_stats = bench::<Arc<rocksdb::DB>>();\n\n    bench::<Lmdb>();\n\n    bench::<Sqlite>();\n    */\n\n    /*\n    let new_sled_vs_rocksdb_storage_ratio =\n        new_stats.disk_space as f64 / rocksdb_stats.disk_space as f64;\n    let new_sled_vs_rocksdb_allocated_memory_ratio =\n        new_stats.allocated_memory as f64 / rocksdb_stats.allocated_memory as f64;\n    let new_sled_vs_rocksdb_freed_memory_ratio =\n        new_stats.freed_memory as f64 / rocksdb_stats.freed_memory as f64;\n    let new_sled_vs_rocksdb_resident_memory_ratio =\n        new_stats.resident_memory as f64 / rocksdb_stats.resident_memory as f64;\n\n    
dbg!(new_sled_vs_rocksdb_storage_ratio);\n    dbg!(new_sled_vs_rocksdb_allocated_memory_ratio);\n    dbg!(new_sled_vs_rocksdb_freed_memory_ratio);\n    dbg!(new_sled_vs_rocksdb_resident_memory_ratio);\n    */\n\n    /*\n    let scan = Instant::now();\n    let count = stone.iter().count();\n    assert_eq!(count as u64, N_WRITES_PER_THREAD);\n    let scan_elapsed = scan.elapsed();\n    println!(\n        \"{} scanned items/s, total {:?}\",\n        (N_WRITES_PER_THREAD * 1_000_000) / u64::try_from(scan_elapsed.as_micros().max(1)).unwrap_or(u64::MAX),\n        scan_elapsed\n    );\n    */\n\n    /*\n    let scan_rev = Instant::now();\n    let count = stone.range(..).rev().count();\n    assert_eq!(count as u64, N_WRITES_PER_THREAD);\n    let scan_rev_elapsed = scan_rev.elapsed();\n    println!(\n        \"{} reverse-scanned items/s, total {:?}\",\n        (N_WRITES_PER_THREAD * 1_000_000) / u64::try_from(scan_rev_elapsed.as_micros().max(1)).unwrap_or(u64::MAX),\n        scan_rev_elapsed\n    );\n    */\n}\n"
  },
  {
    "path": "fuzz/.gitignore",
    "content": "target\ncorpus\nartifacts\n"
  },
  {
    "path": "fuzz/Cargo.toml",
    "content": "[package]\nname = \"bloodstone-fuzz\"\nversion = \"0.0.0\"\nauthors = [\"Automatically generated\"]\npublish = false\nedition = \"2018\"\n\n[package.metadata]\ncargo-fuzz = true\n\n[dependencies.libfuzzer-sys]\nversion = \"0.4.0\"\nfeatures = [\"arbitrary-derive\"]\n\n[dependencies]\narbitrary = { version = \"1.0.3\", features = [\"derive\"] }\ntempfile = \"3.5.0\"\n\n[dependencies.sled]\npath = \"..\"\nfeatures = []\n\n# Prevent this from interfering with workspaces\n[workspace]\nmembers = [\".\"]\n\n[[bin]]\nname = \"fuzz_model\"\npath = \"fuzz_targets/fuzz_model.rs\"\ntest = false\ndoc = false\n"
  },
  {
    "path": "fuzz/fuzz_targets/fuzz_model.rs",
    "content": "#![no_main]\n#[macro_use]\nextern crate libfuzzer_sys;\nextern crate arbitrary;\nextern crate sled;\n\nuse arbitrary::Arbitrary;\n\nuse sled::{Config, Db as SledDb, InlineArray};\n\ntype Db = SledDb<3>;\n\nconst KEYSPACE: u64 = 128;\n\n#[derive(Debug)]\nenum Op {\n    Get { key: InlineArray },\n    Insert { key: InlineArray, value: InlineArray },\n    Reboot,\n    Remove { key: InlineArray },\n    Cas { key: InlineArray, old: Option<InlineArray>, new: Option<InlineArray> },\n    Range { start: InlineArray, end: InlineArray },\n}\n\nfn keygen(\n    u: &mut arbitrary::Unstructured<'_>,\n) -> arbitrary::Result<InlineArray> {\n    let key_i: u64 = u.int_in_range(0..=KEYSPACE)?;\n    Ok(key_i.to_be_bytes().as_ref().into())\n}\n\nimpl<'a> Arbitrary<'a> for Op {\n    fn arbitrary(\n        u: &mut arbitrary::Unstructured<'a>,\n    ) -> arbitrary::Result<Self> {\n        Ok(if u.ratio(1, 2)? {\n            Op::Insert { key: keygen(u)?, value: keygen(u)? }\n        } else if u.ratio(1, 2)? {\n            Op::Get { key: keygen(u)? }\n        } else if u.ratio(1, 2)? {\n            Op::Reboot\n        } else if u.ratio(1, 2)? {\n            Op::Remove { key: keygen(u)? }\n        } else if u.ratio(1, 2)? {\n            Op::Cas {\n                key: keygen(u)?,\n                old: if u.ratio(1, 2)? { Some(keygen(u)?) } else { None },\n                new: if u.ratio(1, 2)? { Some(keygen(u)?) 
} else { None },\n            }\n        } else {\n            let start = u.int_in_range(0..=KEYSPACE)?;\n            let end = (start + 1).max(u.int_in_range(0..=KEYSPACE)?);\n\n            Op::Range {\n                start: start.to_be_bytes().as_ref().into(),\n                end: end.to_be_bytes().as_ref().into(),\n            }\n        })\n    }\n}\n\nfuzz_target!(|ops: Vec<Op>| {\n    let tmp_dir = tempfile::TempDir::new().unwrap();\n    let tmp_path = tmp_dir.path().to_owned();\n    let config = Config::new().path(tmp_path);\n\n    let mut tree: Db = config.open().unwrap();\n    let mut model = std::collections::BTreeMap::new();\n\n    for (_i, op) in ops.into_iter().enumerate() {\n        match op {\n            Op::Insert { key, value } => {\n                assert_eq!(\n                    tree.insert(key.clone(), value.clone()).unwrap(),\n                    model.insert(key, value)\n                );\n            }\n            Op::Get { key } => {\n                assert_eq!(tree.get(&key).unwrap(), model.get(&key).cloned());\n            }\n            Op::Reboot => {\n                drop(tree);\n                tree = config.open().unwrap();\n            }\n            Op::Remove { key } => {\n                assert_eq!(tree.remove(&key).unwrap(), model.remove(&key));\n            }\n            Op::Range { start, end } => {\n                let mut model_iter =\n                    model.range::<InlineArray, _>(&start..&end);\n                let mut tree_iter = tree.range(start..end);\n\n                for (k1, v1) in &mut model_iter {\n                    let (k2, v2) = tree_iter\n                        .next()\n                        .expect(\"None returned from iter when Some expected\")\n                        .expect(\"IO issue encountered\");\n                    assert_eq!((k1, v1), (&k2, &v2));\n                }\n\n                assert!(tree_iter.next().is_none());\n            }\n            Op::Cas { key, old, new } => {\n     
           let succ = if old == model.get(&key).cloned() {\n                    if let Some(n) = &new {\n                        model.insert(key.clone(), n.clone());\n                    } else {\n                        model.remove(&key);\n                    }\n                    true\n                } else {\n                    false\n                };\n\n                let res = tree\n                    .compare_and_swap(key, old.as_ref(), new)\n                    .expect(\"hit IO error\");\n\n                if succ {\n                    assert!(res.is_ok());\n                } else {\n                    assert!(res.is_err());\n                }\n            }\n        };\n\n        for (key, value) in &model {\n            assert_eq!(tree.get(key).unwrap().unwrap(), value);\n        }\n\n        for kv_res in &tree {\n            let (key, value) = kv_res.unwrap();\n            assert_eq!(model.get(&key), Some(&value));\n        }\n    }\n\n    let mut model_iter = model.iter();\n    let mut tree_iter = tree.iter();\n\n    for (k1, v1) in &mut model_iter {\n        let (k2, v2) = tree_iter.next().unwrap().unwrap();\n        assert_eq!((k1, v1), (&k2, &v2));\n    }\n\n    assert!(tree_iter.next().is_none());\n});\n"
  },
  {
    "path": "scripts/cgtest.sh",
    "content": "#!/bin/sh\nset -e\n\ncgdelete memory:sledTest || true\ncgcreate -g memory:sledTest\necho 100M > /sys/fs/cgroup/memory/sledTest/memory.limit_in_bytes\n\nsu $SUDO_USER -c 'cargo build --release --features=testing'\n\nfor test in target/release/deps/test*; do\n  if [[ -x $test ]]\n  then\n    echo running test: $test\n    cgexec -g memory:sledTest $test --test-threads=1\n    rm $test\n  fi\ndone\n"
  },
  {
    "path": "scripts/cross_compile.sh",
    "content": "#!/bin/sh\nset -e\n\n# checks sled's compatibility using several targets\n\ntargets=\"wasm32-wasi wasm32-unknown-unknown aarch64-fuchsia aarch64-linux-android \\\n         i686-linux-android i686-unknown-linux-gnu \\\n         x86_64-linux-android x86_64-fuchsia \\\n         mips-unknown-linux-musl aarch64-apple-ios\"\n\nrustup update --no-self-update\n\nRUSTFLAGS=\"--cfg miri\" cargo check\n\nrustup toolchain install 1.62 --no-self-update\ncargo clean\nrm Cargo.lock\ncargo +1.62 check\n\nfor target in $targets; do\n  echo \"setting up $target...\"\n  rustup target add $target\n  echo \"checking $target...\"\n  cargo check --target $target\ndone\n\n"
  },
  {
    "path": "scripts/execution_explorer.py",
    "content": "#!/usr/bin/gdb --command\n\n\"\"\"\na simple python GDB script for running multithreaded\nprograms in a way that is \"deterministic enough\"\nto tease out and replay interesting bugs.\n\nTyler Neely 25 Sept 2017\nt@jujit.su\n\nreferences:\n    https://sourceware.org/gdb/onlinedocs/gdb/All_002dStop-Mode.html\n    https://sourceware.org/gdb/onlinedocs/gdb/Non_002dStop-Mode.html\n    https://sourceware.org/gdb/onlinedocs/gdb/Threads-In-Python.html\n    https://sourceware.org/gdb/onlinedocs/gdb/Events-In-Python.html\n    https://blog.0x972.info/index.php?tag=gdb.py\n\"\"\"\n\nimport gdb\nimport random\n\n###############################################################################\n#                                   config                                    #\n###############################################################################\n# set this to a number for reproducing results or None to explore randomly\nseed = 156112673742  # None  # 951931004895\n\n# set this to the number of valid threads in the program\n# {2, 3} assumes a main thread that waits on 2 workers.\n# {1, ... 
N} assumes all of the first N threads are to be explored\nthreads_whitelist = {2, 3}\n\n# set this to the file of the binary to explore\nfilename = \"target/debug/binary\"\n\n# set this to the place the threads should rendezvous before exploring\nentrypoint = \"src/main.rs:8\"\n\n# set this to after the threads are done\nexitpoint = \"src/main.rs:12\"\n\n# invariant unreachable points that should never be accessed\nunreachable = [\n        \"panic_unwind::imp::panic\"\n        ]\n\n# set this to the locations you want to test interleavings for\ninteresting = [\n        \"src/main.rs:8\",\n        \"src/main.rs:9\"\n        ]\n\n# uncomment this to output the specific commands issued to gdb\ngdb.execute(\"set trace-commands on\")\n\n###############################################################################\n###############################################################################\n\n\nclass UnreachableBreakpoint(gdb.Breakpoint):\n    pass\n\n\nclass DoneBreakpoint(gdb.Breakpoint):\n    pass\n\n\nclass InterestingBreakpoint(gdb.Breakpoint):\n    pass\n\n\nclass DeterministicExecutor:\n    def __init__(self, seed=None):\n        if seed:\n            print(\"seeding with\", seed)\n            self.seed = seed\n            random.seed(seed)\n        else:\n            # pick a random new seed if not provided with one\n            self.reseed()\n\n        gdb.execute(\"file \" + filename)\n\n        # non-stop is necessary to provide thread-specific\n        # information when breakpoints are hit.\n        gdb.execute(\"set non-stop on\")\n        gdb.execute(\"set confirm off\")\n\n        self.ready = set()\n        self.finished = set()\n\n    def reseed(self):\n        random.seed()\n        self.seed = random.randrange(1e12)\n        print(\"reseeding with\", self.seed)\n        random.seed(self.seed)\n\n    def restart(self):\n        # reset inner state\n        self.ready = set()\n        self.finished = set()\n\n        # disconnect callbacks\n       
 gdb.events.stop.disconnect(self.scheduler_callback)\n        gdb.events.exited.disconnect(self.exit_callback)\n\n        # nuke all breakpoints\n        gdb.execute(\"d\")\n\n        # end execution\n        gdb.execute(\"k\")\n\n        # pick new seed\n        self.reseed()\n\n        self.run()\n\n    def rendezvous_callback(self, event):\n        try:\n            self.ready.add(event.inferior_thread.num)\n            if len(self.ready) == len(threads_whitelist):\n                self.run_schedule()\n        except Exception as e:\n            # this will be thrown if breakpoint is not a part of event,\n            # like when the event was stopped for another reason.\n            print(e)\n\n    def run(self):\n        gdb.execute(\"b \" + entrypoint)\n\n        gdb.events.stop.connect(self.rendezvous_callback)\n        gdb.events.exited.connect(self.exit_callback)\n\n        gdb.execute(\"r\")\n\n    def run_schedule(self):\n        print(\"running schedule\")\n        gdb.execute(\"d\")\n        gdb.events.stop.disconnect(self.rendezvous_callback)\n        gdb.events.stop.connect(self.scheduler_callback)\n\n        for bp in interesting:\n            InterestingBreakpoint(bp)\n\n        for bp in unreachable:\n            UnreachableBreakpoint(bp)\n\n        DoneBreakpoint(exitpoint)\n\n        self.pick()\n\n    def pick(self):\n        threads = self.runnable_threads()\n        if not threads:\n            print(\"restarting execution after running out of valid threads\")\n            self.restart()\n            return\n\n        thread = random.choice(threads)\n\n        gdb.execute(\"t \" + str(thread.num))\n        gdb.execute(\"c\")\n\n    def scheduler_callback(self, event):\n        if not isinstance(event, gdb.BreakpointEvent):\n            print(\"WTF sched callback got\", event.__dict__)\n            return\n\n        if isinstance(event.breakpoint, DoneBreakpoint):\n            self.finished.add(event.inferior_thread.num)\n        elif 
isinstance(event.breakpoint, UnreachableBreakpoint):\n            print(\"!\" * 80)\n            print(\"unreachable breakpoint triggered with seed\", self.seed)\n            print(\"!\" * 80)\n            gdb.events.exited.disconnect(self.exit_callback)\n            gdb.execute(\"q\")\n        else:\n            print(\"thread\", event.inferior_thread.num,\n                  \"hit breakpoint at\", event.breakpoint.location)\n\n        self.pick()\n\n    def runnable_threads(self):\n        threads = gdb.selected_inferior().threads()\n\n        def f(it):\n            return (it.is_valid() and not\n                    it.is_exited() and\n                    it.num in threads_whitelist and\n                    it.num not in self.finished)\n\n        good_threads = [it for it in threads if f(it)]\n        good_threads.sort(key=lambda it: it.num)\n\n        return good_threads\n\n    def exit_callback(self, event):\n        try:\n            if event.exit_code != 0:\n                print(\"!\" * 80)\n                print(\"interesting exit with seed\", self.seed)\n                print(\"!\" * 80)\n            else:\n                print(\"happy exit\")\n                self.restart()\n\n            gdb.execute(\"q\")\n        except Exception as e:\n            pass\n\nde = DeterministicExecutor(seed)\nde.run()\n"
  },
  {
    "path": "scripts/instructions",
    "content": "#!/bin/sh\n# counts instructions for a standard workload\nset -e\n\nOUTFILE=\"cachegrind.stress2.`git describe --always --dirty`-`date +%s`\"\n\nrm -rf default.sled || true\n\ncargo build \\\n  --bin=stress2 \\\n  --release\n\n\n# --tool=callgrind --dump-instr=yes --collect-jumps=yes --simulate-cache=yes \\\n# --callgrind-out-file=\"$OUTFILE\" \\\n\nvalgrind \\\n  --tool=cachegrind \\\n  --cachegrind-out-file=\"$OUTFILE\" \\\n  ./target/release/stress2 --total-ops=50000 --set-prop=1000000000000 --threads=1\n\nLAST=`ls -t cachegrind.stress2.* | sed -n 2p`\n\necho \"comparing $LAST with new $OUTFILE\"\n\necho \"--------------------------------------------------------------------------------\"\necho \"change since last run:\"\necho \"         Ir   I1mr  ILmr          Dr    D1mr    DLmr          Dw    D1mw    DLmw\"\necho \"--------------------------------------------------------------------------------\"\ncg_diff $LAST $OUTFILE | tail -1\n"
  },
  {
    "path": "scripts/sanitizers.sh",
    "content": "#!/bin/bash\nset -eo pipefail\n\npushd benchmarks/stress2\n\nrustup toolchain install nightly\nrustup toolchain install nightly --component rust-src\nrustup update\n\nexport SLED_LOCK_FREE_DELAY_INTENSITY=2000\n\necho \"msan\"\ncargo clean\nexport RUSTFLAGS=\"-Zsanitizer=memory -Zsanitizer-memory-track-origins\"\ncargo +nightly build -Zbuild-std --target x86_64-unknown-linux-gnu\nsudo rm -rf default.sled\nsudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=30 --set-prop=100000000 --val-len=1000 --entries=100 --threads=100\nsudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=30 --entries=100\nsudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=30\nunset MSAN_OPTIONS\n\necho \"asan\"\ncargo clean\nexport RUSTFLAGS=\"-Z sanitizer=address\"\nexport ASAN_OPTIONS=\"detect_odr_violation=0\"\ncargo +nightly build --features=lock_free_delays --target x86_64-unknown-linux-gnu\nsudo rm -rf default.sled\nsudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=60\nsudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=6\nunset ASAN_OPTIONS\n\necho \"lsan\"\ncargo clean\nexport RUSTFLAGS=\"-Z sanitizer=leak\"\ncargo +nightly build --features=lock_free_delays --target x86_64-unknown-linux-gnu\nsudo rm -rf default.sled\nsudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=60\nsudo target/x86_64-unknown-linux-gnu/debug/stress2 --duration=6\n\necho \"tsan\"\ncargo clean\nexport RUSTFLAGS=\"-Z sanitizer=thread\"\nexport TSAN_OPTIONS=suppressions=../../tsan_suppressions.txt\nsudo rm -rf default.sled\ncargo +nightly run --features=lock_free_delays --target x86_64-unknown-linux-gnu -- --duration=60\ncargo +nightly run --features=lock_free_delays --target x86_64-unknown-linux-gnu -- --duration=6\nunset RUSTFLAGS\nunset TSAN_OPTIONS\n"
  },
  {
    "path": "scripts/shufnice.sh",
    "content": "#!/bin/sh\n\nwhile true; do\n  PID=`pgrep $1`\n  TIDS=`ls /proc/$PID/task`\n  TID=`echo $TIDS |  tr \" \" \"\\n\" | shuf -n1`\n  NICE=$((`shuf -i 0-39 -n 1` - 20))\n  echo \"renicing $TID to $NICE\"\n  renice -n $NICE -p $TID\ndone\n"
  },
  {
    "path": "scripts/ubuntu_bench",
    "content": "#!/bin/sh\n\nsudo apt-get update\nsudo apt-get install htop dstat build-essential linux-tools-common linux-tools-generic linux-tools-`uname -r`\ncurl https://sh.rustup.rs -sSf | sh\nsource $HOME/.cargo/env\n\ncargo install flamegraph\n\ngit clone https://github.com/spacejam/sled.git\ncd sled\n\ncores=$(grep -c ^processor /proc/cpuinfo)\nwriters=(($cores / 5 + 1 ))\nreaders=$(( ($cores / 5 + 1) * 4 ))\n\ncargo build --release --bin=stress2 --features=stress\n\n# we use sudo here to get access to symbols\npushd benchmarks/stress2\ncargo flamegraph --release -- --get=$readers --set=$writers\n"
  },
  {
    "path": "src/alloc.rs",
    "content": "#[cfg(any(\n    feature = \"testing-shred-allocator\",\n    feature = \"testing-count-allocator\"\n))]\npub use alloc::*;\n\n// the memshred feature causes all allocated and deallocated\n// memory to be set to a specific non-zero value of 0xa1 for\n// uninitialized allocations and 0xde for deallocated memory,\n// in the hope that it will cause memory errors to surface\n// more quickly.\n\n#[cfg(feature = \"testing-shred-allocator\")]\nmod alloc {\n    use std::alloc::{Layout, System};\n\n    #[global_allocator]\n    static ALLOCATOR: ShredAllocator = ShredAllocator;\n\n    #[derive(Default, Debug, Clone, Copy)]\n    struct ShredAllocator;\n\n    unsafe impl std::alloc::GlobalAlloc for ShredAllocator {\n        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {\n            let ret = System.alloc(layout);\n            assert_ne!(ret, std::ptr::null_mut());\n            std::ptr::write_bytes(ret, 0xa1, layout.size());\n            ret\n        }\n\n        unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {\n            std::ptr::write_bytes(ptr, 0xde, layout.size());\n            System.dealloc(ptr, layout)\n        }\n    }\n}\n\n#[cfg(feature = \"testing-count-allocator\")]\nmod alloc {\n    use std::alloc::{Layout, System};\n\n    #[global_allocator]\n    static ALLOCATOR: CountingAllocator = CountingAllocator;\n\n    static ALLOCATED: AtomicUsize = AtomicUsize::new(0);\n    static FREED: AtomicUsize = AtomicUsize::new(0);\n    static RESIDENT: AtomicUsize = AtomicUsize::new(0);\n\n    fn allocated() -> usize {\n        ALLOCATED.swap(0, Ordering::Relaxed)\n    }\n\n    fn freed() -> usize {\n        FREED.swap(0, Ordering::Relaxed)\n    }\n\n    fn resident() -> usize {\n        RESIDENT.load(Ordering::Relaxed)\n    }\n\n    #[derive(Default, Debug, Clone, Copy)]\n    struct CountingAllocator;\n\n    unsafe impl std::alloc::GlobalAlloc for CountingAllocator {\n        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {\n            let ret 
= System.alloc(layout);\n            assert_ne!(ret, std::ptr::null_mut());\n            ALLOCATED.fetch_add(layout.size(), Ordering::Relaxed);\n            RESIDENT.fetch_add(layout.size(), Ordering::Relaxed);\n            std::ptr::write_bytes(ret, 0xa1, layout.size());\n            ret\n        }\n\n        unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {\n            std::ptr::write_bytes(ptr, 0xde, layout.size());\n            FREED.fetch_add(layout.size(), Ordering::Relaxed);\n            RESIDENT.fetch_sub(layout.size(), Ordering::Relaxed);\n            System.dealloc(ptr, layout)\n        }\n    }\n}\n"
  },
  {
    "path": "src/block_checker.rs",
    "content": "use std::collections::BTreeMap;\nuse std::panic::Location;\nuse std::sync::atomic::{AtomicU64, Ordering};\nuse std::sync::{LazyLock, Mutex};\n\nstatic COUNTER: AtomicU64 = AtomicU64::new(0);\nstatic CHECK_INS: LazyLock<BlockChecker> = LazyLock::new(|| {\n    std::thread::spawn(move || {\n        let mut last_top_10 = Default::default();\n        loop {\n            std::thread::sleep(std::time::Duration::from_secs(5));\n            last_top_10 = CHECK_INS.report(last_top_10);\n        }\n    });\n\n    BlockChecker::default()\n});\n\ntype LocationMap = BTreeMap<u64, &'static Location<'static>>;\n\n#[derive(Default)]\npub(crate) struct BlockChecker {\n    state: Mutex<LocationMap>,\n}\n\nimpl BlockChecker {\n    fn report(&self, last_top_10: LocationMap) -> LocationMap {\n        let state = self.state.lock().unwrap();\n        println!(\"top 10 longest blocking sections:\");\n\n        let top_10: LocationMap =\n            state.iter().take(10).map(|(k, v)| (*k, *v)).collect();\n\n        for (id, location) in &top_10 {\n            if last_top_10.contains_key(id) {\n                println!(\"id: {}, location: {:?}\", id, location);\n            }\n        }\n\n        top_10\n    }\n\n    fn check_in(&self, location: &'static Location) -> BlockGuard {\n        let next_id = COUNTER.fetch_add(1, Ordering::Relaxed);\n        let mut state = self.state.lock().unwrap();\n        state.insert(next_id, location);\n        BlockGuard { id: next_id }\n    }\n\n    fn check_out(&self, id: u64) {\n        let mut state = self.state.lock().unwrap();\n        state.remove(&id);\n    }\n}\n\npub(crate) struct BlockGuard {\n    id: u64,\n}\n\nimpl Drop for BlockGuard {\n    fn drop(&mut self) {\n        CHECK_INS.check_out(self.id)\n    }\n}\n\n#[track_caller]\npub(crate) fn track_blocks() -> BlockGuard {\n    let caller = Location::caller();\n    CHECK_INS.check_in(caller)\n}\n"
  },
  {
    "path": "src/config.rs",
    "content": "use std::io;\nuse std::path::{Path, PathBuf};\nuse std::sync::Arc;\n\nuse fault_injection::{annotate, fallible};\nuse tempdir::TempDir;\n\nuse crate::Db;\n\nmacro_rules! builder {\n    ($(($name:ident, $t:ty, $desc:expr)),*) => {\n        $(\n            #[doc=$desc]\n            pub fn $name(mut self, to: $t) -> Self {\n                self.$name = to;\n                self\n            }\n        )*\n    }\n}\n\n#[derive(Debug, Clone)]\npub struct Config {\n    /// The base directory for storing the database.\n    pub path: PathBuf,\n    /// Cache size in **bytes**. Default is 512mb.\n    pub cache_capacity_bytes: usize,\n    /// The percentage of the cache that is dedicated to the\n    /// scan-resistant entry cache.\n    pub entry_cache_percent: u8,\n    /// Start a background thread that flushes data to disk\n    /// every few milliseconds. Defaults to every 200ms.\n    pub flush_every_ms: Option<usize>,\n    /// The zstd compression level to use when writing data to disk. 
Defaults to 3.\n    pub zstd_compression_level: i32,\n    /// This is only set to `Some` for objects created via\n    /// `Config::tmp`, and will remove the storage directory\n    /// when the final Arc drops.\n    pub tempdir_deleter: Option<Arc<TempDir>>,\n    /// A float between 0.0 and 1.0 that controls how much fragmentation can\n    /// exist in a file before GC attempts to recompact it.\n    pub target_heap_file_fill_ratio: f32,\n    /// Values larger than this configurable will be stored as separate blob\n    pub max_inline_value_threshold: usize,\n}\n\nimpl Default for Config {\n    fn default() -> Config {\n        Config {\n            path: \"bloodstone.default\".into(),\n            flush_every_ms: Some(200),\n            cache_capacity_bytes: 512 * 1024 * 1024,\n            entry_cache_percent: 20,\n            zstd_compression_level: 3,\n            tempdir_deleter: None,\n            target_heap_file_fill_ratio: 0.9,\n            max_inline_value_threshold: 4096,\n        }\n    }\n}\n\nimpl Config {\n    /// Returns a default `Config`\n    pub fn new() -> Config {\n        Config::default()\n    }\n\n    /// Returns a config with the `path` initialized to a system\n    /// temporary directory that will be deleted when this `Config`\n    /// is dropped.\n    pub fn tmp() -> io::Result<Config> {\n        let tempdir = fallible!(tempdir::TempDir::new(\"sled_tmp\"));\n\n        Ok(Config {\n            path: tempdir.path().into(),\n            tempdir_deleter: Some(Arc::new(tempdir)),\n            ..Config::default()\n        })\n    }\n\n    /// Set the path of the database (builder).\n    pub fn path<P: AsRef<Path>>(mut self, path: P) -> Config {\n        self.path = path.as_ref().to_path_buf();\n        self\n    }\n\n    builder!(\n        (flush_every_ms, Option<usize>, \"Start a background thread that flushes data to disk every few milliseconds. Defaults to every 200ms.\"),\n        (cache_capacity_bytes, usize, \"Cache size in **bytes**. 
Default is 512mb.\"),\n        (entry_cache_percent, u8, \"The percentage of the cache that is dedicated to the scan-resistant entry cache.\"),\n        (zstd_compression_level, i32, \"The zstd compression level to use when writing data to disk. Defaults to 3.\"),\n        (target_heap_file_fill_ratio, f32, \"A float between 0.0 and 1.0 that controls how much fragmentation can exist in a file before GC attempts to recompact it.\"),\n        (max_inline_value_threshold, usize, \"Values larger than this configurable will be stored as separate blob\")\n    );\n\n    pub fn open<const LEAF_FANOUT: usize>(\n        &self,\n    ) -> io::Result<Db<LEAF_FANOUT>> {\n        if LEAF_FANOUT < 3 {\n            return Err(annotate!(io::Error::new(\n                io::ErrorKind::Unsupported,\n                \"Db's LEAF_FANOUT const generic must be 3 or greater.\"\n            )));\n        }\n        Db::open_with_config(self)\n    }\n}\n"
  },
  {
    "path": "src/db.rs",
    "content": "use std::collections::HashMap;\nuse std::fmt;\nuse std::io;\nuse std::sync::{Arc, mpsc};\nuse std::time::{Duration, Instant};\n\nuse parking_lot::Mutex;\n\nuse crate::*;\n\n/// sled 1.0 alpha :)\n///\n/// One of the main differences between this and sled 0.34 is that\n/// `Db` and `Tree` now have a `LEAF_FANOUT` const generic parameter.\n/// This parameter is an interesting single-knob performance tunable\n/// that allows users to traverse the performance-vs-efficiency\n/// trade-off spectrum. The default value of `1024` causes keys and\n/// values to be more efficiently compressed when stored on disk,\n/// but for larger-than-memory random workloads it may be advantageous\n/// to lower `LEAF_FANOUT` to between `16` to `256`, depending on your\n/// efficiency requirements. A lower value will also cause contention\n/// to be reduced for frequently accessed data. This value cannot be\n/// changed after creating the database.\n///\n/// As an alpha release, please do not expect this to be safe for\n/// business-critical use cases. 
However, if you would like this to\n/// serve your business-critical use cases over time, please give it\n/// a shot in a low-risk non-production environment and report any\n/// issues you encounter in a github issue.\n///\n/// Note that `Db` implements `Deref` for the default `Tree` (sled's\n/// version of namespaces / keyspaces / buckets), but you can create\n/// and use others using `Db::open_tree`.\n#[derive(Clone)]\npub struct Db<const LEAF_FANOUT: usize = 1024> {\n    config: Config,\n    _shutdown_dropper: Arc<ShutdownDropper<LEAF_FANOUT>>,\n    cache: ObjectCache<LEAF_FANOUT>,\n    trees: Arc<Mutex<HashMap<CollectionId, Tree<LEAF_FANOUT>>>>,\n    collection_id_allocator: Arc<Allocator>,\n    collection_name_mapping: Tree<LEAF_FANOUT>,\n    default_tree: Tree<LEAF_FANOUT>,\n    was_recovered: bool,\n}\n\nimpl<const LEAF_FANOUT: usize> std::ops::Deref for Db<LEAF_FANOUT> {\n    type Target = Tree<LEAF_FANOUT>;\n    fn deref(&self) -> &Tree<LEAF_FANOUT> {\n        &self.default_tree\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> IntoIterator for &Db<LEAF_FANOUT> {\n    type Item = io::Result<(InlineArray, InlineArray)>;\n    type IntoIter = crate::Iter<LEAF_FANOUT>;\n\n    fn into_iter(self) -> Self::IntoIter {\n        self.iter()\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> fmt::Debug for Db<LEAF_FANOUT> {\n    fn fmt(&self, w: &mut fmt::Formatter<'_>) -> fmt::Result {\n        let alternate = w.alternate();\n\n        let mut debug_struct = w.debug_struct(&format!(\"Db<{}>\", LEAF_FANOUT));\n\n        if alternate {\n            debug_struct\n                .field(\"global_error\", &self.check_error())\n                .field(\n                    \"data\",\n                    &format!(\"{:?}\", self.iter().collect::<Vec<_>>()),\n                )\n                .finish()\n        } else {\n            debug_struct.field(\"global_error\", &self.check_error()).finish()\n        }\n    }\n}\n\nfn flusher<const LEAF_FANOUT: usize>(\n    cache: 
ObjectCache<LEAF_FANOUT>,\n    shutdown_signal: mpsc::Receiver<mpsc::Sender<()>>,\n    flush_every_ms: usize,\n) {\n    let interval = Duration::from_millis(flush_every_ms as _);\n    let mut last_flush_duration = Duration::default();\n\n    let flush = || {\n        let flush_res_res = std::panic::catch_unwind(|| cache.flush());\n        match flush_res_res {\n            Ok(Ok(_)) => {\n                // don't abort.\n                return;\n            }\n            Ok(Err(flush_failure)) => {\n                log::error!(\n                    \"Db flusher encountered error while flushing: {:?}\",\n                    flush_failure\n                );\n                cache.set_error(&flush_failure);\n            }\n            Err(panicked) => {\n                log::error!(\n                    \"Db flusher panicked while flushing: {:?}\",\n                    panicked\n                );\n                cache.set_error(&io::Error::other(\n                    \"Db flusher panicked while flushing\".to_string(),\n                ));\n            }\n        }\n        std::process::abort();\n    };\n\n    loop {\n        let recv_timeout = interval\n            .saturating_sub(last_flush_duration)\n            .max(Duration::from_millis(1));\n        if let Ok(shutdown_sender) = shutdown_signal.recv_timeout(recv_timeout)\n        {\n            flush();\n\n            // this is probably unnecessary but it will avoid issues\n            // if egregious bugs get introduced that trigger it\n            cache.set_error(&io::Error::other(\n                \"system has been shut down\".to_string(),\n            ));\n\n            assert!(cache.is_clean());\n\n            drop(cache);\n\n            if let Err(e) = shutdown_sender.send(()) {\n                log::error!(\n                    \"Db flusher could not ack shutdown to requestor: {e:?}\"\n                );\n            }\n            log::debug!(\n                \"flush thread terminating after 
signalling to requestor\"\n            );\n            return;\n        }\n\n        let before_flush = Instant::now();\n\n        flush();\n\n        last_flush_duration = before_flush.elapsed();\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> Drop for Db<LEAF_FANOUT> {\n    fn drop(&mut self) {\n        if self.config.flush_every_ms.is_none() {\n            if let Err(e) = self.flush() {\n                log::error!(\"failed to flush Db on Drop: {e:?}\");\n            }\n        } else {\n            // otherwise, it is expected that the flusher thread will\n            // flush while shutting down the final Db/Tree instance\n        }\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> Db<LEAF_FANOUT> {\n    #[cfg(feature = \"for-internal-testing-only\")]\n    fn validate(&self) -> io::Result<()> {\n        // for each tree, iterate over index, read node and assert low key matches\n        // and assert first time we've ever seen node ID\n\n        let mut ever_seen = std::collections::HashSet::new();\n        let before = std::time::Instant::now();\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        let _b0 = crate::block_checker::track_blocks();\n\n        for (_cid, tree) in self.trees.lock().iter() {\n            let mut hi_none_count = 0;\n            let mut last_hi = None;\n            for (low, node) in tree.index.iter() {\n                // ensure we haven't reused the object_id across Trees\n                assert!(ever_seen.insert(node.object_id));\n\n                let (read_low, node_mu, read_node) =\n                    tree.page_in(&low, self.cache.current_flush_epoch())?;\n\n                assert_eq!(read_node.object_id, node.object_id);\n                assert_eq!(node_mu.leaf.as_ref().unwrap().lo, low);\n                assert_eq!(read_low, low);\n\n                if let Some(hi) = &last_hi {\n                    assert_eq!(hi, &node_mu.leaf.as_ref().unwrap().lo);\n                }\n\n                if let Some(hi) = 
&node_mu.leaf.as_ref().unwrap().hi {\n                    last_hi = Some(hi.clone());\n                } else {\n                    assert_eq!(hi_none_count, 0);\n                    hi_none_count += 1;\n                }\n            }\n            // each tree should have exactly one leaf with no max hi key\n            assert_eq!(hi_none_count, 1);\n        }\n\n        log::debug!(\n            \"{} leaves looking good after {} micros\",\n            ever_seen.len(),\n            before.elapsed().as_micros()\n        );\n\n        Ok(())\n    }\n\n    pub fn stats(&self) -> Stats {\n        Stats { cache: self.cache.stats() }\n    }\n\n    pub fn size_on_disk(&self) -> io::Result<u64> {\n        use std::fs::read_dir;\n\n        fn recurse(mut dir: std::fs::ReadDir) -> io::Result<u64> {\n            dir.try_fold(0, |acc, file| {\n                let file = file?;\n                let size = match file.metadata()? {\n                    data if data.is_dir() => recurse(read_dir(file.path())?)?,\n                    data => data.len(),\n                };\n                Ok(acc + size)\n            })\n        }\n\n        recurse(read_dir(&self.cache.config.path)?)\n    }\n\n    /// Returns `true` if the database was\n    /// recovered from a previous process.\n    /// Note that database state is only\n    /// guaranteed to be present up to the\n    /// last call to `flush`! 
Otherwise state\n    /// is synced to disk periodically if the\n    /// `Config.flush_every_ms` configuration option\n    /// is set to `Some(number_of_ms_between_syncs)`\n    /// or if the IO buffer gets filled to\n    /// capacity before being rotated.\n    pub fn was_recovered(&self) -> bool {\n        self.was_recovered\n    }\n\n    pub fn open_with_config(config: &Config) -> io::Result<Db<LEAF_FANOUT>> {\n        let (shutdown_tx, shutdown_rx) = mpsc::channel();\n\n        let (cache, indices, was_recovered) = ObjectCache::recover(config)?;\n\n        let _shutdown_dropper = Arc::new(ShutdownDropper {\n            shutdown_sender: Mutex::new(shutdown_tx),\n            cache: Mutex::new(cache.clone()),\n        });\n\n        let mut allocated_collection_ids = fnv::FnvHashSet::default();\n\n        let mut trees: HashMap<CollectionId, Tree<LEAF_FANOUT>> = indices\n            .into_iter()\n            .map(|(collection_id, index)| {\n                assert!(\n                    allocated_collection_ids.insert(collection_id.0),\n                    \"allocated_collection_ids already contained {:?}\",\n                    collection_id\n                );\n                (\n                    collection_id,\n                    Tree::new(\n                        collection_id,\n                        cache.clone(),\n                        index,\n                        _shutdown_dropper.clone(),\n                    ),\n                )\n            })\n            .collect();\n\n        let collection_name_mapping =\n            trees.get(&NAME_MAPPING_COLLECTION_ID).unwrap().clone();\n\n        let default_tree = trees.get(&DEFAULT_COLLECTION_ID).unwrap().clone();\n\n        for kv_res in collection_name_mapping.iter() {\n            let (_collection_name, collection_id_buf) = kv_res.unwrap();\n            let collection_id = CollectionId(u64::from_le_bytes(\n                collection_id_buf.as_ref().try_into().unwrap(),\n            ));\n\n            
if trees.contains_key(&collection_id) {\n                continue;\n            }\n\n            // need to initialize tree leaf for empty collection\n\n            assert!(\n                allocated_collection_ids.insert(collection_id.0),\n                \"allocated_collection_ids already contained {:?}\",\n                collection_id\n            );\n\n            let initial_low_key = InlineArray::default();\n\n            let empty_node = cache.allocate_default_node(collection_id);\n\n            let index = Index::default();\n\n            assert!(index.insert(initial_low_key, empty_node).is_none());\n\n            let tree = Tree::new(\n                collection_id,\n                cache.clone(),\n                index,\n                _shutdown_dropper.clone(),\n            );\n\n            trees.insert(collection_id, tree);\n        }\n\n        let collection_id_allocator =\n            Arc::new(Allocator::from_allocated(&allocated_collection_ids));\n\n        assert_eq!(collection_name_mapping.len()? 
+ 2, trees.len());\n\n        let ret = Db {\n            config: config.clone(),\n            cache: cache.clone(),\n            default_tree,\n            collection_name_mapping,\n            collection_id_allocator,\n            trees: Arc::new(Mutex::new(trees)),\n            _shutdown_dropper,\n            was_recovered,\n        };\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        ret.validate()?;\n\n        if let Some(flush_every_ms) = ret.cache.config.flush_every_ms {\n            let spawn_res = std::thread::Builder::new()\n                .name(\"sled_flusher\".into())\n                .spawn(move || flusher(cache, shutdown_rx, flush_every_ms));\n\n            if let Err(e) = spawn_res {\n                return Err(io::Error::other(format!(\n                    \"unable to spawn flusher thread for sled database: {:?}\",\n                    e\n                )));\n            }\n        }\n        Ok(ret)\n    }\n\n    /// A database export method for all collections in the `Db`,\n    /// for use in sled version upgrades. 
Can be used in combination\n    /// with the `import` method below on a database running a later\n    /// version.\n    ///\n    /// # Panics\n    ///\n    /// Panics if any IO problems occur while trying\n    /// to perform the export.\n    ///\n    /// # Examples\n    ///\n    /// If you want to migrate from one version of sled\n    /// to another, you need to pull in both versions\n    /// by using version renaming:\n    ///\n    /// `Cargo.toml`:\n    ///\n    /// ```toml\n    /// [dependencies]\n    /// sled = \"0.32\"\n    /// old_sled = { version = \"0.31\", package = \"sled\" }\n    /// ```\n    ///\n    /// and in your code, remember that old versions of\n    /// sled might have a different way to open them\n    /// than the current `sled::open` method:\n    ///\n    /// ```\n    /// # use sled as old_sled;\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// let old = old_sled::open(\"my_old_db_export\")?;\n    ///\n    /// // may be a different version of sled,\n    /// // the export type is version agnostic.\n    /// let new = sled::open(\"my_new_db_export\")?;\n    ///\n    /// let export = old.export();\n    /// new.import(export);\n    ///\n    /// assert_eq!(old.checksum()?, new.checksum()?);\n    /// # drop(old);\n    /// # drop(new);\n    /// # let _ = std::fs::remove_dir_all(\"my_old_db_export\");\n    /// # let _ = std::fs::remove_dir_all(\"my_new_db_export\");\n    /// # Ok(()) }\n    /// ```\n    pub fn export(\n        &self,\n    ) -> Vec<(\n        CollectionType,\n        CollectionName,\n        impl Iterator<Item = Vec<Vec<u8>>> + '_,\n    )> {\n        let trees = self.trees.lock();\n\n        let mut ret = vec![];\n\n        for kv_res in self.collection_name_mapping.iter() {\n            let (collection_name, collection_id_buf) = kv_res.unwrap();\n            let collection_id = CollectionId(u64::from_le_bytes(\n                collection_id_buf.as_ref().try_into().unwrap(),\n            ));\n            let tree 
= trees.get(&collection_id).unwrap().clone();\n\n            ret.push((\n                b\"tree\".to_vec(),\n                collection_name.to_vec(),\n                tree.iter().map(|kv_opt| {\n                    let kv = kv_opt.unwrap();\n                    vec![kv.0.to_vec(), kv.1.to_vec()]\n                }),\n            ));\n        }\n\n        ret\n    }\n\n    /// Imports the collections from a previous database.\n    ///\n    /// # Panics\n    ///\n    /// Panics if any IO problems occur while trying\n    /// to perform the import.\n    ///\n    /// # Examples\n    ///\n    /// If you want to migrate from one version of sled\n    /// to another, you need to pull in both versions\n    /// by using version renaming:\n    ///\n    /// `Cargo.toml`:\n    ///\n    /// ```toml\n    /// [dependencies]\n    /// sled = \"0.32\"\n    /// old_sled = { version = \"0.31\", package = \"sled\" }\n    /// ```\n    ///\n    /// and in your code, remember that old versions of\n    /// sled might have a different way to open them\n    /// than the current `sled::open` method:\n    ///\n    /// ```\n    /// # use sled as old_sled;\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// let old = old_sled::open(\"my_old_db_import\")?;\n    ///\n    /// // may be a different version of sled,\n    /// // the export type is version agnostic.\n    /// let new = sled::open(\"my_new_db_import\")?;\n    ///\n    /// let export = old.export();\n    /// new.import(export);\n    ///\n    /// assert_eq!(old.checksum()?, new.checksum()?);\n    /// # drop(old);\n    /// # drop(new);\n    /// # let _ = std::fs::remove_dir_all(\"my_old_db_import\");\n    /// # let _ = std::fs::remove_dir_all(\"my_new_db_import\");\n    /// # Ok(()) }\n    /// ```\n    pub fn import(\n        &self,\n        export: Vec<(\n            CollectionType,\n            CollectionName,\n            impl Iterator<Item = Vec<Vec<u8>>>,\n        )>,\n    ) {\n        for (collection_type, 
collection_name, collection_iter) in export {\n            match collection_type {\n                ref t if t == b\"tree\" => {\n                    let tree = self\n                        .open_tree(collection_name)\n                        .expect(\"failed to open new tree during import\");\n                    for mut kv in collection_iter {\n                        let v = kv\n                            .pop()\n                            .expect(\"failed to get value from tree export\");\n                        let k = kv\n                            .pop()\n                            .expect(\"failed to get key from tree export\");\n                        let old = tree.insert(k, v).expect(\n                            \"failed to insert value during tree import\",\n                        );\n                        assert!(\n                            old.is_none(),\n                            \"import is overwriting existing data\"\n                        );\n                    }\n                }\n                other => panic!(\"unknown collection type {:?}\", other),\n            }\n        }\n    }\n\n    pub fn contains_tree<V: AsRef<[u8]>>(&self, name: V) -> io::Result<bool> {\n        Ok(self.collection_name_mapping.get(name.as_ref())?.is_some())\n    }\n\n    pub fn drop_tree<V: AsRef<[u8]>>(&self, name: V) -> io::Result<bool> {\n        let name_ref = name.as_ref();\n        let trees = self.trees.lock();\n\n        let tree = if let Some(collection_id_buf) =\n            self.collection_name_mapping.get(name_ref)?\n        {\n            let collection_id = CollectionId(u64::from_le_bytes(\n                collection_id_buf.as_ref().try_into().unwrap(),\n            ));\n\n            trees.get(&collection_id).unwrap()\n        } else {\n            return Ok(false);\n        };\n\n        tree.clear()?;\n\n        self.collection_name_mapping.remove(name_ref)?;\n\n        Ok(true)\n    }\n\n    /// Open or create a new disk-backed 
[`Tree`] with its own keyspace,\n    /// accessible from the `Db` via the provided identifier.\n    pub fn open_tree<V: AsRef<[u8]>>(\n        &self,\n        name: V,\n    ) -> io::Result<Tree<LEAF_FANOUT>> {\n        let name_ref = name.as_ref();\n        let mut trees = self.trees.lock();\n\n        if let Some(collection_id_buf) =\n            self.collection_name_mapping.get(name_ref)?\n        {\n            let collection_id = CollectionId(u64::from_le_bytes(\n                collection_id_buf.as_ref().try_into().unwrap(),\n            ));\n\n            let tree = trees.get(&collection_id).unwrap();\n\n            return Ok(tree.clone());\n        }\n\n        let collection_id =\n            CollectionId(self.collection_id_allocator.allocate());\n\n        let initial_low_key = InlineArray::default();\n\n        let empty_node = self.cache.allocate_default_node(collection_id);\n\n        let index = Index::default();\n\n        assert!(index.insert(initial_low_key, empty_node).is_none());\n\n        let tree = Tree::new(\n            collection_id,\n            self.cache.clone(),\n            index,\n            self._shutdown_dropper.clone(),\n        );\n\n        self.collection_name_mapping\n            .insert(name_ref, &collection_id.0.to_le_bytes())?;\n\n        trees.insert(collection_id, tree.clone());\n\n        Ok(tree)\n    }\n}\n\n/// These types provide the information that allows an entire\n/// system to be exported and imported to facilitate\n/// major upgrades. It is comprised entirely\n/// of standard library types to be forward compatible.\n/// NB these definitions are expensive to change, because\n/// they impact the migration path.\ntype CollectionType = Vec<u8>;\ntype CollectionName = Vec<u8>;\n"
  },
  {
    "path": "src/event_verifier.rs",
    "content": "use std::collections::BTreeMap;\nuse std::sync::Mutex;\n\nuse crate::{FlushEpoch, ObjectId};\n\n#[derive(Debug, Clone, Copy, PartialEq, Eq)]\npub(crate) enum State {\n    Unallocated,\n    Dirty,\n    CooperativelySerialized,\n    AddedToWriteBatch,\n    Flushed,\n    CleanPagedIn,\n    PagedOut,\n}\n\nimpl State {\n    fn can_transition_within_epoch_to(&self, next: State) -> bool {\n        match (self, next) {\n            (State::Flushed, State::PagedOut) => true,\n            (State::Flushed, _) => false,\n            (State::AddedToWriteBatch, State::Flushed) => true,\n            (State::AddedToWriteBatch, _) => false,\n            (State::CleanPagedIn, State::AddedToWriteBatch) => false,\n            (State::CleanPagedIn, State::Flushed) => false,\n            (State::Dirty, State::AddedToWriteBatch) => true,\n            (State::CooperativelySerialized, State::AddedToWriteBatch) => true,\n            (State::CooperativelySerialized, _) => false,\n            (State::Unallocated, State::AddedToWriteBatch) => true,\n            (State::Unallocated, _) => false,\n            (State::Dirty, State::Dirty) => true,\n            (State::Dirty, State::CooperativelySerialized) => true,\n            (State::Dirty, State::Unallocated) => true,\n            (State::Dirty, _) => false,\n            (State::CleanPagedIn, State::Dirty) => true,\n            (State::CleanPagedIn, State::PagedOut) => true,\n            (State::CleanPagedIn, State::CleanPagedIn) => true,\n            (State::CleanPagedIn, State::Unallocated) => true,\n            (State::CleanPagedIn, State::CooperativelySerialized) => true,\n            (State::PagedOut, State::CleanPagedIn) => true,\n            (State::PagedOut, _) => false,\n        }\n    }\n\n    fn needs_flush(&self) -> bool {\n        match self {\n            State::CleanPagedIn => false,\n            State::Flushed => false,\n            State::PagedOut => false,\n            _ => true,\n        }\n    
}\n}\n\n#[derive(Debug, Default)]\npub(crate) struct EventVerifier {\n    flush_model:\n        Mutex<BTreeMap<(ObjectId, FlushEpoch), Vec<(State, &'static str)>>>,\n}\n\nimpl Drop for EventVerifier {\n    fn drop(&mut self) {\n        // assert that nothing is currently Dirty\n        let flush_model = self.flush_model.lock().unwrap();\n        for ((oid, _epoch), history) in flush_model.iter() {\n            if let Some((last_state, _at)) = history.last() {\n                assert_ne!(\n                    *last_state,\n                    State::Dirty,\n                    \"{oid:?} is Dirty when system shutting down\"\n                );\n            }\n        }\n    }\n}\n\nimpl EventVerifier {\n    pub(crate) fn mark(\n        &self,\n        object_id: ObjectId,\n        epoch: FlushEpoch,\n        state: State,\n        at: &'static str,\n    ) {\n        if matches!(state, State::PagedOut) {\n            let dirty_epochs = self.dirty_epochs(object_id);\n            if !dirty_epochs.is_empty() {\n                println!(\"{object_id:?} was paged out while having dirty epochs {dirty_epochs:?}\");\n                self.print_debug_history_for_object(object_id);\n                println!(\"{state:?} {epoch:?} {at}\");\n                println!(\"invalid object state transition\");\n                std::process::abort();\n            }\n        }\n\n        let mut flush_model = self.flush_model.lock().unwrap();\n        let history = flush_model.entry((object_id, epoch)).or_default();\n\n        if let Some((last_state, _at)) = history.last() {\n            if !last_state.can_transition_within_epoch_to(state) {\n                println!(\n                    \"object_id {object_id:?} performed \\\n                    illegal state transition from {last_state:?} \\\n                    to {state:?} at {at} in epoch {epoch:?}.\"\n                );\n\n                println!(\"history:\");\n                history.push((state, at));\n\n                let 
active_epochs = flush_model.range(\n                    (object_id, FlushEpoch::MIN)..=(object_id, FlushEpoch::MAX),\n                );\n                for ((_oid, epoch), history) in active_epochs {\n                    for (last_state, at) in history {\n                        println!(\"{last_state:?} {epoch:?} {at}\");\n                    }\n                }\n\n                println!(\"invalid object state transition\");\n\n                std::process::abort();\n            }\n        }\n        history.push((state, at));\n    }\n\n    /// Returns the FlushEpochs for which this ObjectId has unflushed\n    /// dirty data for.\n    fn dirty_epochs(&self, object_id: ObjectId) -> Vec<FlushEpoch> {\n        let mut dirty_epochs = vec![];\n        let flush_model = self.flush_model.lock().unwrap();\n\n        let active_epochs = flush_model\n            .range((object_id, FlushEpoch::MIN)..=(object_id, FlushEpoch::MAX));\n\n        for ((_oid, epoch), history) in active_epochs {\n            let (last_state, _at) = history.last().unwrap();\n            if last_state.needs_flush() {\n                dirty_epochs.push(*epoch);\n            }\n        }\n\n        dirty_epochs\n    }\n\n    pub(crate) fn print_debug_history_for_object(&self, object_id: ObjectId) {\n        let flush_model = self.flush_model.lock().unwrap();\n        println!(\"history for object {:?}:\", object_id);\n        let active_epochs = flush_model\n            .range((object_id, FlushEpoch::MIN)..=(object_id, FlushEpoch::MAX));\n        for ((_oid, epoch), history) in active_epochs {\n            for (last_state, at) in history {\n                println!(\"{last_state:?} {epoch:?} {at}\");\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/flush_epoch.rs",
    "content": "use std::num::NonZeroU64;\nuse std::sync::atomic::{AtomicPtr, AtomicU64, Ordering};\nuse std::sync::{Arc, Condvar, Mutex};\n\nconst SEAL_BIT: u64 = 1 << 63;\nconst SEAL_MASK: u64 = u64::MAX - SEAL_BIT;\nconst MIN_EPOCH: u64 = 2;\n\n#[derive(\n    Debug,\n    Clone,\n    Copy,\n    serde::Serialize,\n    serde::Deserialize,\n    PartialOrd,\n    Ord,\n    PartialEq,\n    Eq,\n    Hash,\n)]\npub struct FlushEpoch(NonZeroU64);\n\nimpl FlushEpoch {\n    pub const MIN: FlushEpoch = FlushEpoch(NonZeroU64::MIN);\n    #[allow(unused)]\n    pub const MAX: FlushEpoch = FlushEpoch(NonZeroU64::MAX);\n\n    pub fn increment(&self) -> FlushEpoch {\n        FlushEpoch(NonZeroU64::new(self.0.get() + 1).unwrap())\n    }\n\n    pub fn get(&self) -> u64 {\n        self.0.get()\n    }\n}\n\nimpl concurrent_map::Minimum for FlushEpoch {\n    const MIN: FlushEpoch = FlushEpoch::MIN;\n}\n\n#[derive(Debug)]\npub(crate) struct FlushInvariants {\n    max_flushed_epoch: AtomicU64,\n    max_flushing_epoch: AtomicU64,\n}\n\nimpl Default for FlushInvariants {\n    fn default() -> FlushInvariants {\n        FlushInvariants {\n            max_flushed_epoch: (MIN_EPOCH - 1).into(),\n            max_flushing_epoch: (MIN_EPOCH - 1).into(),\n        }\n    }\n}\n\nimpl FlushInvariants {\n    pub(crate) fn mark_flushed_epoch(&self, epoch: FlushEpoch) {\n        let last = self.max_flushed_epoch.swap(epoch.get(), Ordering::SeqCst);\n\n        assert_eq!(last + 1, epoch.get());\n    }\n\n    pub(crate) fn mark_flushing_epoch(&self, epoch: FlushEpoch) {\n        let last = self.max_flushing_epoch.swap(epoch.get(), Ordering::SeqCst);\n\n        assert_eq!(last + 1, epoch.get());\n    }\n}\n\n#[derive(Clone, Debug)]\npub(crate) struct Completion {\n    mu: Arc<Mutex<bool>>,\n    cv: Arc<Condvar>,\n    epoch: FlushEpoch,\n}\n\nimpl Completion {\n    pub fn epoch(&self) -> FlushEpoch {\n        self.epoch\n    }\n\n    pub fn new(epoch: FlushEpoch) -> Completion {\n        Completion { mu: 
Default::default(), cv: Default::default(), epoch }\n    }\n\n    pub fn wait_for_complete(self) -> FlushEpoch {\n        let mut mu = self.mu.lock().unwrap();\n        while !*mu {\n            mu = self.cv.wait(mu).unwrap();\n        }\n\n        self.epoch\n    }\n\n    pub fn mark_complete(self) {\n        self.mark_complete_inner(false);\n    }\n\n    fn mark_complete_inner(&self, previously_sealed: bool) {\n        let mut mu = self.mu.lock().unwrap();\n        if !previously_sealed {\n            // TODO reevaluate - assert!(!*mu);\n        }\n        log::trace!(\"marking epoch {:?} as complete\", self.epoch);\n        // it's possible for *mu to already be true due to this being\n        // immediately dropped in the check_in method when we see that\n        // the checked-in epoch has already been marked as sealed.\n        *mu = true;\n        drop(mu);\n        self.cv.notify_all();\n    }\n\n    #[cfg(test)]\n    pub fn is_complete(&self) -> bool {\n        *self.mu.lock().unwrap()\n    }\n}\n\npub struct FlushEpochGuard<'a> {\n    tracker: &'a EpochTracker,\n    previously_sealed: bool,\n}\n\nimpl Drop for FlushEpochGuard<'_> {\n    fn drop(&mut self) {\n        let rc = self.tracker.rc.fetch_sub(1, Ordering::SeqCst) - 1;\n        if rc & SEAL_MASK == 0 && (rc & SEAL_BIT) == SEAL_BIT {\n            crate::debug_delay();\n            self.tracker\n                .vacancy_notifier\n                .mark_complete_inner(self.previously_sealed);\n        }\n    }\n}\n\nimpl FlushEpochGuard<'_> {\n    pub fn epoch(&self) -> FlushEpoch {\n        self.tracker.epoch\n    }\n}\n\n#[derive(Debug)]\npub(crate) struct EpochTracker {\n    epoch: FlushEpoch,\n    rc: AtomicU64,\n    vacancy_notifier: Completion,\n    previous_flush_complete: Completion,\n}\n\n#[derive(Clone, Debug)]\npub(crate) struct FlushEpochTracker {\n    active_ebr: ebr::Ebr<Box<EpochTracker>, 16, 16>,\n    inner: Arc<FlushEpochInner>,\n}\n\n#[derive(Debug)]\npub(crate) struct FlushEpochInner 
{\n    counter: AtomicU64,\n    roll_mu: Mutex<()>,\n    current_active: AtomicPtr<EpochTracker>,\n}\n\nimpl Drop for FlushEpochInner {\n    fn drop(&mut self) {\n        let vacancy_mu = self.roll_mu.lock().unwrap();\n        let old_ptr =\n            self.current_active.swap(std::ptr::null_mut(), Ordering::SeqCst);\n        if !old_ptr.is_null() {\n            //let old: &EpochTracker = &*old_ptr;\n            unsafe { drop(Box::from_raw(old_ptr)) }\n        }\n        drop(vacancy_mu);\n    }\n}\n\nimpl Default for FlushEpochTracker {\n    fn default() -> FlushEpochTracker {\n        let last = Completion::new(FlushEpoch(NonZeroU64::new(1).unwrap()));\n        let current_active_ptr = Box::into_raw(Box::new(EpochTracker {\n            epoch: FlushEpoch(NonZeroU64::new(MIN_EPOCH).unwrap()),\n            rc: AtomicU64::new(0),\n            vacancy_notifier: Completion::new(FlushEpoch(\n                NonZeroU64::new(MIN_EPOCH).unwrap(),\n            )),\n            previous_flush_complete: last.clone(),\n        }));\n\n        last.mark_complete();\n\n        let current_active = AtomicPtr::new(current_active_ptr);\n\n        FlushEpochTracker {\n            inner: Arc::new(FlushEpochInner {\n                counter: AtomicU64::new(2),\n                roll_mu: Mutex::new(()),\n                current_active,\n            }),\n            active_ebr: ebr::Ebr::default(),\n        }\n    }\n}\n\nimpl FlushEpochTracker {\n    /// Returns the epoch notifier for the previous epoch.\n    /// Intended to be passed to a flusher that can eventually\n    /// notify the flush-requesting thread.\n    pub fn roll_epoch_forward(&self) -> (Completion, Completion, Completion) {\n        let mut tracker_guard = self.active_ebr.pin();\n\n        let vacancy_mu = self.inner.roll_mu.lock().unwrap();\n\n        let flush_through = self.inner.counter.fetch_add(1, Ordering::SeqCst);\n\n        let flush_through_epoch =\n            
FlushEpoch(NonZeroU64::new(flush_through).unwrap());\n\n        let new_epoch = flush_through_epoch.increment();\n\n        let forward_flush_notifier = Completion::new(flush_through_epoch);\n\n        let new_active = Box::into_raw(Box::new(EpochTracker {\n            epoch: new_epoch,\n            rc: AtomicU64::new(0),\n            vacancy_notifier: Completion::new(new_epoch),\n            previous_flush_complete: forward_flush_notifier.clone(),\n        }));\n\n        let old_ptr =\n            self.inner.current_active.swap(new_active, Ordering::SeqCst);\n\n        assert!(!old_ptr.is_null());\n\n        let (last_flush_complete_notifier, vacancy_notifier) = unsafe {\n            let old: &EpochTracker = &*old_ptr;\n            let last = old.rc.fetch_add(SEAL_BIT + 1, Ordering::SeqCst);\n\n            assert_eq!(\n                last & SEAL_BIT,\n                0,\n                \"epoch {} double-sealed\",\n                flush_through\n            );\n\n            // mark_complete_inner called via drop in a uniform way\n            //println!(\"dropping flush epoch guard for epoch {flush_through}\");\n            drop(FlushEpochGuard { tracker: old, previously_sealed: true });\n\n            (old.previous_flush_complete.clone(), old.vacancy_notifier.clone())\n        };\n        tracker_guard.defer_drop(unsafe { Box::from_raw(old_ptr) });\n        drop(vacancy_mu);\n        (last_flush_complete_notifier, vacancy_notifier, forward_flush_notifier)\n    }\n\n    pub fn check_in<'a>(&self) -> FlushEpochGuard<'a> {\n        let _tracker_guard = self.active_ebr.pin();\n        loop {\n            let tracker: &'a EpochTracker =\n                unsafe { &*self.inner.current_active.load(Ordering::SeqCst) };\n\n            let rc = tracker.rc.fetch_add(1, Ordering::SeqCst);\n\n            let previously_sealed = rc & SEAL_BIT == SEAL_BIT;\n\n            let guard = FlushEpochGuard { tracker, previously_sealed };\n\n            if previously_sealed {\n         
       // the epoch is already closed, so we must drop the rc\n                // and possibly notify, which is handled in the guard's\n                // Drop impl.\n                drop(guard);\n            } else {\n                return guard;\n            }\n        }\n    }\n\n    pub fn manually_advance_epoch(&self) {\n        self.active_ebr.manually_advance_epoch();\n    }\n\n    pub fn current_flush_epoch(&self) -> FlushEpoch {\n        let current = self.inner.counter.load(Ordering::SeqCst);\n\n        FlushEpoch(NonZeroU64::new(current).unwrap())\n    }\n}\n\n#[test]\nfn flush_epoch_basic_functionality() {\n    let epoch_tracker = FlushEpochTracker::default();\n\n    for expected in MIN_EPOCH..1_000_000 {\n        let g1 = epoch_tracker.check_in();\n        let g2 = epoch_tracker.check_in();\n\n        assert_eq!(g1.tracker.epoch.0.get(), expected);\n        assert_eq!(g2.tracker.epoch.0.get(), expected);\n\n        let previous_notifier = epoch_tracker.roll_epoch_forward().1;\n        assert!(!previous_notifier.is_complete());\n\n        drop(g1);\n        assert!(!previous_notifier.is_complete());\n        drop(g2);\n        assert_eq!(previous_notifier.wait_for_complete().0.get(), expected);\n    }\n}\n\n#[cfg(test)]\nfn concurrent_flush_epoch_burn_in_inner() {\n    const N_THREADS: usize = 10;\n    const N_OPS_PER_THREAD: usize = 3000;\n\n    let fa = FlushEpochTracker::default();\n\n    let barrier = std::sync::Arc::new(std::sync::Barrier::new(21));\n\n    let pt = pagetable::PageTable::<AtomicU64>::default();\n\n    let rolls = || {\n        let fa = fa.clone();\n        let barrier = barrier.clone();\n        let pt = &pt;\n        move || {\n            barrier.wait();\n            for _ in 0..N_OPS_PER_THREAD {\n                let (previous, this, next) = fa.roll_epoch_forward();\n                let last_epoch = previous.wait_for_complete().0.get();\n                assert_eq!(0, pt.get(last_epoch).load(Ordering::Acquire));\n                
let flush_through_epoch = this.wait_for_complete().0.get();\n                assert_eq!(\n                    0,\n                    pt.get(flush_through_epoch).load(Ordering::Acquire)\n                );\n\n                next.mark_complete();\n            }\n        }\n    };\n\n    let check_ins = || {\n        let fa = fa.clone();\n        let barrier = barrier.clone();\n        let pt = &pt;\n        move || {\n            barrier.wait();\n            for _ in 0..N_OPS_PER_THREAD {\n                let guard = fa.check_in();\n                let epoch = guard.epoch().0.get();\n                pt.get(epoch).fetch_add(1, Ordering::SeqCst);\n                std::thread::yield_now();\n                pt.get(epoch).fetch_sub(1, Ordering::SeqCst);\n                drop(guard);\n            }\n        }\n    };\n\n    std::thread::scope(|s| {\n        let mut threads = vec![];\n\n        for _ in 0..N_THREADS {\n            threads.push(s.spawn(rolls()));\n            threads.push(s.spawn(check_ins()));\n        }\n\n        barrier.wait();\n\n        for thread in threads.into_iter() {\n            thread.join().expect(\"a test thread crashed unexpectedly\");\n        }\n    });\n\n    for i in 0..N_OPS_PER_THREAD * N_THREADS {\n        assert_eq!(0, pt.get(i as u64).load(Ordering::Acquire));\n    }\n}\n\n#[test]\nfn concurrent_flush_epoch_burn_in() {\n    for _ in 0..128 {\n        concurrent_flush_epoch_burn_in_inner();\n    }\n}\n"
  },
  {
    "path": "src/heap.rs",
    "content": "use std::fmt;\nuse std::fs;\nuse std::io::{self, Read};\nuse std::num::NonZeroU64;\nuse std::path::{Path, PathBuf};\nuse std::sync::Arc;\nuse std::sync::atomic::{AtomicPtr, AtomicU64, Ordering, fence};\nuse std::time::{Duration, Instant};\n\nuse ebr::{Ebr, Guard};\nuse fault_injection::{annotate, fallible, maybe};\nuse fnv::FnvHashSet;\nuse fs2::FileExt as _;\nuse parking_lot::{Mutex, RwLock};\nuse rayon::prelude::*;\n\nuse crate::object_location_mapper::{AllocatorStats, ObjectLocationMapper};\nuse crate::{CollectionId, Config, DeferredFree, MetadataStore, ObjectId};\n\nconst WARN: &str = \"DO_NOT_PUT_YOUR_FILES_HERE\";\npub(crate) const N_SLABS: usize = 78;\nconst FILE_TARGET_FILL_RATIO: u64 = 80;\nconst FILE_RESIZE_MARGIN: u64 = 115;\n\nconst SLAB_SIZES: [usize; N_SLABS] = [\n    64,     // 0x40\n    80,     // 0x50\n    96,     // 0x60\n    112,    // 0x70\n    128,    // 0x80\n    160,    // 0xa0\n    192,    // 0xc0\n    224,    // 0xe0\n    256,    // 0x100\n    320,    // 0x140\n    384,    // 0x180\n    448,    // 0x1c0\n    512,    // 0x200\n    640,    // 0x280\n    768,    // 0x300\n    896,    // 0x380\n    1024,   // 0x400\n    1280,   // 0x500\n    1536,   // 0x600\n    1792,   // 0x700\n    2048,   // 0x800\n    2560,   // 0xa00\n    3072,   // 0xc00\n    3584,   // 0xe00\n    4096,   // 0x1000\n    5120,   // 0x1400\n    6144,   // 0x1800\n    7168,   // 0x1c00\n    8192,   // 0x2000\n    10240,  // 0x2800\n    12288,  // 0x3000\n    14336,  // 0x3800\n    16384,  // 0x4000\n    20480,  // 0x5000\n    24576,  // 0x6000\n    28672,  // 0x7000\n    32768,  // 0x8000\n    40960,  // 0xa000\n    49152,  // 0xc000\n    57344,  // 0xe000\n    65536,  // 0x10000\n    98304,  // 0x1a000\n    131072, // 0x20000\n    163840, // 0x28000\n    196608,\n    262144,\n    393216,\n    524288,\n    786432,\n    1048576,\n    1572864,\n    2097152,\n    3145728,\n    4194304,\n    6291456,\n    8388608,\n    12582912,\n    16777216,\n    25165824,\n   
 33554432,\n    50331648,\n    67108864,\n    100663296,\n    134217728,\n    201326592,\n    268435456,\n    402653184,\n    536870912,\n    805306368,\n    1073741824,\n    1610612736,\n    2147483648,\n    3221225472,\n    4294967296,\n    6442450944,\n    8589934592,\n    12884901888,\n    17_179_869_184, // 17gb is max page size as-of now\n];\n\n#[derive(Default, Debug, Copy, Clone)]\npub struct WriteBatchStats {\n    pub heap_bytes_written: u64,\n    pub heap_files_written_to: u64,\n    /// Latency inclusive of fsync\n    pub heap_write_latency: Duration,\n    /// Latency for fsyncing files\n    pub heap_sync_latency: Duration,\n    pub metadata_bytes_written: u64,\n    pub metadata_write_latency: Duration,\n    pub truncated_files: u64,\n    pub truncated_bytes: u64,\n    pub truncate_latency: Duration,\n}\n\n#[derive(Default, Debug, Clone, Copy)]\npub struct HeapStats {\n    pub allocator: AllocatorStats,\n    pub write_batch_max: WriteBatchStats,\n    pub write_batch_sum: WriteBatchStats,\n    pub truncated_file_bytes: u64,\n}\n\nimpl WriteBatchStats {\n    pub(crate) fn max(&self, other: &WriteBatchStats) -> WriteBatchStats {\n        WriteBatchStats {\n            heap_bytes_written: self\n                .heap_bytes_written\n                .max(other.heap_bytes_written),\n            heap_files_written_to: self\n                .heap_files_written_to\n                .max(other.heap_files_written_to),\n            heap_write_latency: self\n                .heap_write_latency\n                .max(other.heap_write_latency),\n            heap_sync_latency: self\n                .heap_sync_latency\n                .max(other.heap_sync_latency),\n            metadata_bytes_written: self\n                .metadata_bytes_written\n                .max(other.metadata_bytes_written),\n            metadata_write_latency: self\n                .metadata_write_latency\n                .max(other.metadata_write_latency),\n            truncated_files: 
self.truncated_files.max(other.truncated_files),\n            truncated_bytes: self.truncated_bytes.max(other.truncated_bytes),\n            truncate_latency: self.truncate_latency.max(other.truncate_latency),\n        }\n    }\n\n    pub(crate) fn sum(&self, other: &WriteBatchStats) -> WriteBatchStats {\n        use std::ops::Add;\n        WriteBatchStats {\n            heap_bytes_written: self\n                .heap_bytes_written\n                .add(other.heap_bytes_written),\n            heap_files_written_to: self\n                .heap_files_written_to\n                .add(other.heap_files_written_to),\n            heap_write_latency: self\n                .heap_write_latency\n                .add(other.heap_write_latency),\n            heap_sync_latency: self\n                .heap_sync_latency\n                .add(other.heap_sync_latency),\n            metadata_bytes_written: self\n                .metadata_bytes_written\n                .add(other.metadata_bytes_written),\n            metadata_write_latency: self\n                .metadata_write_latency\n                .add(other.metadata_write_latency),\n            truncated_files: self.truncated_files.add(other.truncated_files),\n            truncated_bytes: self.truncated_bytes.add(other.truncated_bytes),\n            truncate_latency: self.truncate_latency.add(other.truncate_latency),\n        }\n    }\n}\n\nconst fn overhead_for_size(size: usize) -> usize {\n    if size + 5 <= u8::MAX as usize {\n        // crc32 + 1 byte frame\n        5\n    } else if size + 6 <= u16::MAX as usize {\n        // crc32 + 2 byte frame\n        6\n    } else if size + 8 <= u32::MAX as usize {\n        // crc32 + 4 byte frame\n        8\n    } else {\n        // crc32 + 8 byte frame\n        12\n    }\n}\n\nfn slab_for_size(size: usize) -> u8 {\n    let total_size = size + overhead_for_size(size);\n    for (idx, slab_size) in SLAB_SIZES.iter().enumerate() {\n        if *slab_size >= total_size {\n            return 
u8::try_from(idx).unwrap();\n        }\n    }\n    u8::MAX\n}\n\npub use inline_array::InlineArray;\n\n#[derive(Debug)]\npub struct ObjectRecovery {\n    pub object_id: ObjectId,\n    pub collection_id: CollectionId,\n    pub low_key: InlineArray,\n}\n\npub struct HeapRecovery {\n    pub heap: Heap,\n    pub recovered_nodes: Vec<ObjectRecovery>,\n    pub was_recovered: bool,\n}\n\nenum PersistentSettings {\n    V1 { leaf_fanout: u64 },\n}\n\nimpl PersistentSettings {\n    // NB: should only be called with a directory lock already exclusively acquired\n    fn verify_or_store<P: AsRef<Path>>(\n        &self,\n        path: P,\n        _directory_lock: &std::fs::File,\n    ) -> io::Result<()> {\n        let settings_path = path.as_ref().join(\"durability_cookie\");\n\n        match std::fs::read(&settings_path) {\n            Ok(previous_bytes) => {\n                let previous =\n                    PersistentSettings::deserialize(&previous_bytes)?;\n                self.check_compatibility(&previous)\n            }\n            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {\n                std::fs::write(settings_path, self.serialize())\n            }\n            Err(e) => Err(e),\n        }\n    }\n\n    fn deserialize(buf: &[u8]) -> io::Result<PersistentSettings> {\n        let mut cursor = buf;\n        let mut buf = [0_u8; 64];\n        cursor.read_exact(&mut buf)?;\n\n        let version = u16::from_le_bytes([buf[0], buf[1]]);\n\n        let crc_actual = (crc32fast::hash(&buf[0..60]) ^ 0xAF).to_le_bytes();\n        let crc_expected = &buf[60..];\n\n        if crc_actual != crc_expected {\n            return Err(io::Error::new(\n                io::ErrorKind::InvalidData,\n                \"encountered corrupted settings cookie with mismatched CRC.\",\n            ));\n        }\n\n        match version {\n            1 => {\n                let leaf_fanout =\n                    u64::from_le_bytes(buf[2..10].try_into().unwrap());\n                
Ok(PersistentSettings::V1 { leaf_fanout })\n            }\n            _ => Err(io::Error::new(\n                io::ErrorKind::InvalidData,\n                \"encountered unknown version number when reading settings cookie\",\n            )),\n        }\n    }\n\n    fn check_compatibility(\n        &self,\n        other: &PersistentSettings,\n    ) -> io::Result<()> {\n        use PersistentSettings::*;\n\n        match (self, other) {\n            (V1 { leaf_fanout: lf1 }, V1 { leaf_fanout: lf2 }) => {\n                if lf1 != lf2 {\n                    Err(io::Error::new(\n                        io::ErrorKind::Unsupported,\n                        format!(\n                            \"sled was already opened with a LEAF_FANOUT const generic of {}, \\\n                                and this may not be changed after initial creation. Please use \\\n                                Db::import / Db::export to migrate, if you wish to change the \\\n                                system's format.\",\n                            lf2\n                        ),\n                    ))\n                } else {\n                    Ok(())\n                }\n            }\n        }\n    }\n\n    fn serialize(&self) -> Vec<u8> {\n        // format: 64 bytes in total, with the last 4 being a LE crc32\n        // first 2 are LE version number\n        let mut buf = vec![];\n\n        match self {\n            PersistentSettings::V1 { leaf_fanout } => {\n                // LEAF_FANOUT: 8 bytes LE\n                let version: [u8; 2] = 1_u16.to_le_bytes();\n                buf.extend_from_slice(&version);\n\n                buf.extend_from_slice(&leaf_fanout.to_le_bytes());\n            }\n        }\n\n        // zero-pad the buffer\n        assert!(buf.len() < 60);\n        buf.resize(60, 0);\n\n        let hash: u32 = crc32fast::hash(&buf) ^ 0xAF;\n        let hash_bytes: [u8; 4] = hash.to_le_bytes();\n        buf.extend_from_slice(&hash_bytes);\n\n        // keep 
the buffer to 64 bytes for easy parsing over time.\n        assert_eq!(buf.len(), 64);\n\n        buf\n    }\n}\n\n#[derive(Clone, Copy, Debug, PartialEq)]\npub(crate) struct SlabAddress {\n    slab_id: u8,\n    slab_slot: [u8; 7],\n}\n\nimpl SlabAddress {\n    pub(crate) fn from_slab_slot(slab: u8, slot: u64) -> SlabAddress {\n        let slot_bytes = slot.to_be_bytes();\n\n        assert_eq!(slot_bytes[0], 0);\n\n        SlabAddress {\n            slab_id: slab,\n            slab_slot: slot_bytes[1..].try_into().unwrap(),\n        }\n    }\n\n    #[inline]\n    pub const fn slab(&self) -> u8 {\n        self.slab_id\n    }\n\n    #[inline]\n    pub const fn slot(&self) -> u64 {\n        u64::from_be_bytes([\n            0,\n            self.slab_slot[0],\n            self.slab_slot[1],\n            self.slab_slot[2],\n            self.slab_slot[3],\n            self.slab_slot[4],\n            self.slab_slot[5],\n            self.slab_slot[6],\n        ])\n    }\n}\n\nimpl From<NonZeroU64> for SlabAddress {\n    fn from(i: NonZeroU64) -> SlabAddress {\n        let i = i.get();\n        let bytes = i.to_be_bytes();\n        SlabAddress {\n            slab_id: bytes[0] - 1,\n            slab_slot: bytes[1..].try_into().unwrap(),\n        }\n    }\n}\n\nimpl From<SlabAddress> for NonZeroU64 {\n    fn from(sa: SlabAddress) -> NonZeroU64 {\n        NonZeroU64::new(u64::from_be_bytes([\n            sa.slab_id + 1,\n            sa.slab_slot[0],\n            sa.slab_slot[1],\n            sa.slab_slot[2],\n            sa.slab_slot[3],\n            sa.slab_slot[4],\n            sa.slab_slot[5],\n            sa.slab_slot[6],\n        ]))\n        .unwrap()\n    }\n}\n\n#[cfg(unix)]\nmod sys_io {\n    use std::io;\n    use std::os::unix::fs::FileExt;\n\n    use super::*;\n\n    pub(super) fn read_exact_at(\n        file: &fs::File,\n        buf: &mut [u8],\n        offset: u64,\n    ) -> io::Result<()> {\n        match maybe!(file.read_exact_at(buf, offset)) {\n            
Ok(r) => Ok(r),\n            Err(e) => {\n                // FIXME BUG 3: failed to read 64 bytes at offset 192 from file with len 192\n                println!(\n                    \"failed to read {} bytes at offset {} from file with len {}\",\n                    buf.len(),\n                    offset,\n                    file.metadata().unwrap().len(),\n                );\n                let _ = dbg!(std::backtrace::Backtrace::force_capture());\n                Err(e)\n            }\n        }\n    }\n\n    pub(super) fn write_all_at(\n        file: &fs::File,\n        buf: &[u8],\n        offset: u64,\n    ) -> io::Result<()> {\n        maybe!(file.write_all_at(buf, offset))\n    }\n}\n\n#[cfg(windows)]\nmod sys_io {\n    use std::os::windows::fs::FileExt;\n\n    use super::*;\n\n    pub(super) fn read_exact_at(\n        file: &fs::File,\n        mut buf: &mut [u8],\n        mut offset: u64,\n    ) -> io::Result<()> {\n        while !buf.is_empty() {\n            match maybe!(file.seek_read(buf, offset)) {\n                Ok(0) => break,\n                Ok(n) => {\n                    let tmp = buf;\n                    buf = &mut tmp[n..];\n                    offset += n as u64;\n                }\n                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}\n                Err(e) => return Err(annotate!(e)),\n            }\n        }\n        if !buf.is_empty() {\n            Err(annotate!(io::Error::new(\n                io::ErrorKind::UnexpectedEof,\n                \"failed to fill whole buffer\"\n            )))\n        } else {\n            Ok(())\n        }\n    }\n\n    pub(super) fn write_all_at(\n        file: &fs::File,\n        mut buf: &[u8],\n        mut offset: u64,\n    ) -> io::Result<()> {\n        while !buf.is_empty() {\n            match maybe!(file.seek_write(buf, offset)) {\n                Ok(0) => {\n                    return Err(annotate!(io::Error::new(\n                        io::ErrorKind::WriteZero,\n      
                  \"failed to write whole buffer\",\n                    )));\n                }\n                Ok(n) => {\n                    buf = &buf[n..];\n                    offset += n as u64;\n                }\n                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}\n                Err(e) => return Err(annotate!(e)),\n            }\n        }\n        Ok(())\n    }\n}\n\n#[derive(Debug)]\nstruct Slab {\n    file: fs::File,\n    slot_size: usize,\n    max_live_slot_since_last_truncation: AtomicU64,\n}\n\nimpl Slab {\n    fn sync(&self) -> io::Result<()> {\n        self.file.sync_all()\n    }\n\n    fn read(\n        &self,\n        slot: u64,\n        _guard: &mut Guard<'_, DeferredFree, 16, 16>,\n    ) -> io::Result<Vec<u8>> {\n        log::trace!(\"reading from slot {} in slab {}\", slot, self.slot_size);\n\n        let mut data = Vec::with_capacity(self.slot_size);\n        unsafe {\n            data.set_len(self.slot_size);\n        }\n\n        let whence = self.slot_size as u64 * slot;\n\n        maybe!(sys_io::read_exact_at(&self.file, &mut data, whence))?;\n\n        let hash_actual: [u8; 4] =\n            (crc32fast::hash(&data[..self.slot_size - 4]) ^ 0xAF).to_le_bytes();\n        let hash_expected = &data[self.slot_size - 4..];\n\n        if hash_expected != hash_actual {\n            return Err(annotate!(io::Error::new(\n                io::ErrorKind::InvalidData,\n                \"crc mismatch - data corruption detected\"\n            )));\n        }\n\n        let len: usize = if self.slot_size <= u8::MAX as usize {\n            // crc32 + 1 byte frame\n            usize::from(data[self.slot_size - 5])\n        } else if self.slot_size <= u16::MAX as usize {\n            // crc32 + 2 byte frame\n            let mut size_bytes: [u8; 2] = [0; 2];\n            size_bytes\n                .copy_from_slice(&data[self.slot_size - 6..self.slot_size - 4]);\n            usize::from(u16::from_le_bytes(size_bytes))\n        } 
else if self.slot_size <= u32::MAX as usize {\n            // crc32 + 4 byte frame\n            let mut size_bytes: [u8; 4] = [0; 4];\n            size_bytes\n                .copy_from_slice(&data[self.slot_size - 8..self.slot_size - 4]);\n            usize::try_from(u32::from_le_bytes(size_bytes)).unwrap()\n        } else {\n            // crc32 + 8 byte frame\n            let mut size_bytes: [u8; 8] = [0; 8];\n            size_bytes.copy_from_slice(\n                &data[self.slot_size - 12..self.slot_size - 4],\n            );\n            usize::try_from(u64::from_le_bytes(size_bytes)).unwrap()\n        };\n\n        data.truncate(len);\n\n        Ok(data)\n    }\n\n    fn write(&self, slot: u64, mut data: Vec<u8>) -> io::Result<()> {\n        let len = data.len();\n\n        assert!(len + overhead_for_size(data.len()) <= self.slot_size);\n\n        data.resize(self.slot_size, 0);\n\n        if self.slot_size <= u8::MAX as usize {\n            // crc32 + 1 byte frame\n            data[self.slot_size - 5] = u8::try_from(len).unwrap();\n        } else if self.slot_size <= u16::MAX as usize {\n            // crc32 + 2 byte frame\n            let size_bytes: [u8; 2] = u16::try_from(len).unwrap().to_le_bytes();\n            data[self.slot_size - 6..self.slot_size - 4]\n                .copy_from_slice(&size_bytes);\n        } else if self.slot_size <= u32::MAX as usize {\n            // crc32 + 4 byte frame\n            let size_bytes: [u8; 4] = u32::try_from(len).unwrap().to_le_bytes();\n            data[self.slot_size - 8..self.slot_size - 4]\n                .copy_from_slice(&size_bytes);\n        } else {\n            // crc32 + 8 byte frame\n            let size_bytes: [u8; 8] = u64::try_from(len).unwrap().to_le_bytes();\n            data[self.slot_size - 12..self.slot_size - 4]\n                .copy_from_slice(&size_bytes);\n        }\n\n        let hash: [u8; 4] =\n            (crc32fast::hash(&data[..self.slot_size - 4]) ^ 0xAF).to_le_bytes();\n        
data[self.slot_size - 4..].copy_from_slice(&hash);\n\n        let whence = self.slot_size as u64 * slot;\n\n        log::trace!(\"writing to slot {} in slab {}\", slot, self.slot_size);\n        sys_io::write_all_at(&self.file, &data, whence)\n    }\n}\n\nfn set_error(\n    global_error: &AtomicPtr<(io::ErrorKind, String)>,\n    error: &io::Error,\n) {\n    let kind = error.kind();\n    let reason = error.to_string();\n\n    let boxed = Box::new((kind, reason));\n    let ptr = Box::into_raw(boxed);\n\n    if global_error\n        .compare_exchange(\n            std::ptr::null_mut(),\n            ptr,\n            Ordering::SeqCst,\n            Ordering::SeqCst,\n        )\n        .is_err()\n    {\n        // global fatal error already installed, drop this one\n        unsafe {\n            drop(Box::from_raw(ptr));\n        }\n    }\n}\n\n#[derive(Debug)]\npub enum Update {\n    Store {\n        object_id: ObjectId,\n        collection_id: CollectionId,\n        low_key: InlineArray,\n        data: Vec<u8>,\n    },\n    Free {\n        object_id: ObjectId,\n        collection_id: CollectionId,\n    },\n}\n\nimpl Update {\n    #[allow(unused)]\n    pub(crate) fn object_id(&self) -> ObjectId {\n        match self {\n            Update::Store { object_id, .. }\n            | Update::Free { object_id, .. } => *object_id,\n        }\n    }\n}\n\n#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]\npub enum UpdateMetadata {\n    Store {\n        object_id: ObjectId,\n        collection_id: CollectionId,\n        low_key: InlineArray,\n        location: NonZeroU64,\n    },\n    Free {\n        object_id: ObjectId,\n        collection_id: CollectionId,\n    },\n}\n\nimpl UpdateMetadata {\n    pub fn object_id(&self) -> ObjectId {\n        match self {\n            UpdateMetadata::Store { object_id, .. }\n            | UpdateMetadata::Free { object_id, .. 
} => *object_id,\n        }\n    }\n}\n\n#[derive(Debug, Default, Clone, Copy)]\nstruct WriteBatchStatTracker {\n    sum: WriteBatchStats,\n    max: WriteBatchStats,\n}\n\n#[derive(Clone)]\npub struct Heap {\n    path: PathBuf,\n    slabs: Arc<[Slab; N_SLABS]>,\n    table: ObjectLocationMapper,\n    metadata_store: Arc<Mutex<MetadataStore>>,\n    free_ebr: Ebr<DeferredFree, 16, 16>,\n    global_error: Arc<AtomicPtr<(io::ErrorKind, String)>>,\n    #[allow(unused)]\n    directory_lock: Arc<fs::File>,\n    stats: Arc<RwLock<WriteBatchStatTracker>>,\n    truncated_file_bytes: Arc<AtomicU64>,\n}\n\nimpl fmt::Debug for Heap {\n    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {\n        f.debug_struct(\"Heap\")\n            .field(\"path\", &self.path)\n            .field(\"stats\", &self.stats())\n            .finish()\n    }\n}\n\nimpl Heap {\n    pub fn recover(\n        leaf_fanout: usize,\n        config: &Config,\n    ) -> io::Result<HeapRecovery> {\n        let path = &config.path;\n        log::trace!(\"recovering Heap at {:?}\", path);\n        let slabs_dir = path.join(\"slabs\");\n\n        // TODO NOCOMMIT\n        let sync_status = std::process::Command::new(\"sync\")\n            .status()\n            .map(|status| status.success());\n\n        if !matches!(sync_status, Ok(true)) {\n            log::warn!(\n                \"sync command before recovery failed: {:?}\",\n                sync_status\n            );\n        }\n\n        // initialize directories if not present\n        let mut was_recovered = true;\n        for p in [path, &slabs_dir] {\n            if let Err(e) = fs::read_dir(p) {\n                if e.kind() == io::ErrorKind::NotFound {\n                    fallible!(fs::create_dir_all(p));\n                    was_recovered = false;\n                    continue;\n                }\n            }\n            maybe!(fs::File::open(p).and_then(|f| f.sync_all()))?;\n        }\n\n        let _ = 
fs::File::create(path.join(WARN));\n\n        let mut file_lock_opts = fs::OpenOptions::new();\n        file_lock_opts.create(false).read(false).write(false);\n        let directory_lock = fallible!(fs::File::open(path));\n        fallible!(directory_lock.try_lock_exclusive());\n\n        maybe!(fs::File::open(&slabs_dir).and_then(|f| f.sync_all()))?;\n        maybe!(directory_lock.sync_all())?;\n\n        let persistent_settings =\n            PersistentSettings::V1 { leaf_fanout: leaf_fanout as u64 };\n\n        persistent_settings.verify_or_store(path, &directory_lock)?;\n\n        let (metadata_store, recovered_metadata) =\n            MetadataStore::recover(path.join(\"metadata\"))?;\n\n        let table = ObjectLocationMapper::new(\n            &recovered_metadata,\n            config.target_heap_file_fill_ratio,\n        );\n\n        let mut recovered_nodes =\n            Vec::<ObjectRecovery>::with_capacity(recovered_metadata.len());\n\n        for update_metadata in recovered_metadata {\n            match update_metadata {\n                UpdateMetadata::Store {\n                    object_id,\n                    collection_id,\n                    location: _,\n                    low_key,\n                } => {\n                    recovered_nodes.push(ObjectRecovery {\n                        object_id,\n                        collection_id,\n                        low_key,\n                    });\n                }\n                UpdateMetadata::Free { .. 
} => {\n                    unreachable!()\n                }\n            }\n        }\n\n        let mut slabs = vec![];\n        let mut slab_opts = fs::OpenOptions::new();\n        slab_opts.create(true).read(true).write(true);\n        for slot_size in &SLAB_SIZES {\n            let slab_path = slabs_dir.join(format!(\"{}\", slot_size));\n\n            let file = fallible!(slab_opts.open(slab_path));\n\n            slabs.push(Slab {\n                slot_size: *slot_size,\n                file,\n                max_live_slot_since_last_truncation: AtomicU64::new(0),\n            })\n        }\n\n        maybe!(fs::File::open(&slabs_dir).and_then(|f| f.sync_all()))?;\n\n        log::debug!(\"recovery of Heap at {:?} complete\", path);\n\n        Ok(HeapRecovery {\n            heap: Heap {\n                slabs: Arc::new(slabs.try_into().unwrap()),\n                path: path.into(),\n                table,\n                global_error: metadata_store.get_global_error_arc(),\n                metadata_store: Arc::new(Mutex::new(metadata_store)),\n                directory_lock: Arc::new(directory_lock),\n                free_ebr: Ebr::default(),\n                truncated_file_bytes: Arc::default(),\n                stats: Arc::default(),\n            },\n            recovered_nodes,\n            was_recovered,\n        })\n    }\n\n    pub fn get_global_error_arc(\n        &self,\n    ) -> Arc<AtomicPtr<(io::ErrorKind, String)>> {\n        self.global_error.clone()\n    }\n\n    fn check_error(&self) -> io::Result<()> {\n        let err_ptr: *const (io::ErrorKind, String) =\n            self.global_error.load(Ordering::Acquire);\n\n        if err_ptr.is_null() {\n            Ok(())\n        } else {\n            let deref: &(io::ErrorKind, String) = unsafe { &*err_ptr };\n            Err(io::Error::new(deref.0, deref.1.clone()))\n        }\n    }\n\n    fn set_error(&self, error: &io::Error) {\n        set_error(&self.global_error, error);\n    }\n\n    pub fn 
manually_advance_epoch(&self) {\n        self.free_ebr.manually_advance_epoch();\n    }\n\n    pub fn stats(&self) -> HeapStats {\n        let truncated_file_bytes =\n            self.truncated_file_bytes.load(Ordering::Acquire);\n\n        let stats = self.stats.read();\n\n        HeapStats {\n            truncated_file_bytes,\n            allocator: self.table.stats(),\n            write_batch_max: stats.max,\n            write_batch_sum: stats.sum,\n        }\n    }\n\n    pub fn read(&self, object_id: ObjectId) -> Option<io::Result<Vec<u8>>> {\n        if let Err(e) = self.check_error() {\n            return Some(Err(e));\n        }\n\n        let mut guard = self.free_ebr.pin();\n        let slab_address = self.table.get_location_for_object(object_id)?;\n\n        let slab = &self.slabs[usize::from(slab_address.slab_id)];\n\n        match slab.read(slab_address.slot(), &mut guard) {\n            Ok(bytes) => Some(Ok(bytes)),\n            Err(e) => {\n                let annotated = annotate!(e);\n                self.set_error(&annotated);\n                Some(Err(annotated))\n            }\n        }\n    }\n\n    pub fn write_batch(\n        &self,\n        batch: Vec<Update>,\n    ) -> io::Result<WriteBatchStats> {\n        self.check_error()?;\n        let metadata_store = self.metadata_store.try_lock()\n            .expect(\"write_batch called concurrently! 
major correctness assumpiton violated\");\n        let mut guard = self.free_ebr.pin();\n\n        let slabs = &self.slabs;\n        let table = &self.table;\n\n        let heap_bytes_written = AtomicU64::new(0);\n        let heap_files_used_0_to_63 = AtomicU64::new(0);\n        let heap_files_used_64_to_127 = AtomicU64::new(0);\n\n        let map_closure = |update: Update| match update {\n            Update::Store { object_id, collection_id, low_key, data } => {\n                let data_len = data.len();\n                let slab_id = slab_for_size(data_len);\n                let slab = &slabs[usize::from(slab_id)];\n                let new_location = table.allocate_slab_slot(slab_id);\n                let new_location_nzu: NonZeroU64 = new_location.into();\n\n                let complete_durability_pipeline =\n                    maybe!(slab.write(new_location.slot(), data));\n\n                if let Err(e) = complete_durability_pipeline {\n                    // can immediately free slot as the\n                    table.free_slab_slot(new_location);\n                    return Err(e);\n                }\n\n                // record stats\n                heap_bytes_written\n                    .fetch_add(data_len as u64, Ordering::Release);\n\n                if slab_id < 64 {\n                    let slab_bit = 0b1 << slab_id;\n                    heap_files_used_0_to_63\n                        .fetch_or(slab_bit, Ordering::Release);\n                } else {\n                    assert!(slab_id < 128);\n                    let slab_bit = 0b1 << (slab_id - 64);\n                    heap_files_used_64_to_127\n                        .fetch_or(slab_bit, Ordering::Release);\n                }\n\n                Ok(UpdateMetadata::Store {\n                    object_id,\n                    collection_id,\n                    low_key,\n                    location: new_location_nzu,\n                })\n            }\n            Update::Free { object_id, 
collection_id } => {\n                Ok(UpdateMetadata::Free { object_id, collection_id })\n            }\n        };\n\n        let before_heap_write = Instant::now();\n\n        let metadata_batch_res: io::Result<Vec<UpdateMetadata>> =\n            batch.into_par_iter().map(map_closure).collect();\n\n        let before_heap_sync = Instant::now();\n\n        fence(Ordering::SeqCst);\n\n        for slab_id in 0..N_SLABS {\n            let dirty = if slab_id < 64 {\n                let slab_bit = 0b1 << slab_id;\n\n                heap_files_used_0_to_63.load(Ordering::Acquire) & slab_bit\n                    == slab_bit\n            } else {\n                let slab_bit = 0b1 << (slab_id - 64);\n\n                heap_files_used_64_to_127.load(Ordering::Acquire) & slab_bit\n                    == slab_bit\n            };\n\n            if dirty {\n                self.slabs[slab_id].sync()?;\n            }\n        }\n\n        let heap_sync_latency = before_heap_sync.elapsed();\n\n        let heap_write_latency = before_heap_write.elapsed();\n\n        let metadata_batch = match metadata_batch_res {\n            Ok(mut mb) => {\n                // TODO evaluate impact : cost ratio of this sort\n                mb.par_sort_unstable();\n                mb\n            }\n            Err(e) => {\n                self.set_error(&e);\n                return Err(e);\n            }\n        };\n\n        // make metadata durable\n        let before_metadata_write = Instant::now();\n        let metadata_bytes_written =\n            match metadata_store.write_batch(&metadata_batch) {\n                Ok(metadata_bytes_written) => metadata_bytes_written,\n                Err(e) => {\n                    self.set_error(&e);\n                    return Err(e);\n                }\n            };\n        let metadata_write_latency = before_metadata_write.elapsed();\n\n        // reclaim previous disk locations for future writes\n        for update_metadata in metadata_batch 
{\n            let last_address_opt = match update_metadata {\n                UpdateMetadata::Store { object_id, location, .. } => {\n                    self.table.insert(object_id, SlabAddress::from(location))\n                }\n                UpdateMetadata::Free { object_id, .. } => {\n                    guard.defer_drop(DeferredFree {\n                        allocator: self.table.clone_object_id_allocator_arc(),\n                        freed_slot: object_id.0.get(),\n                    });\n                    self.table.remove(object_id)\n                }\n            };\n\n            if let Some(last_address) = last_address_opt {\n                guard.defer_drop(DeferredFree {\n                    allocator: self\n                        .table\n                        .clone_slab_allocator_arc(last_address.slab_id),\n                    freed_slot: last_address.slot(),\n                });\n            }\n        }\n\n        // truncate files that are now too fragmented\n        let before_truncate = Instant::now();\n        let mut truncated_files = 0;\n        let mut truncated_bytes = 0;\n        for (i, max_live_slot) in self.table.get_max_allocated_per_slab() {\n            let slab = &self.slabs[i];\n\n            let last_max = slab\n                .max_live_slot_since_last_truncation\n                .fetch_max(max_live_slot, Ordering::SeqCst);\n\n            let max_since_last_truncation = last_max.max(max_live_slot);\n\n            let currently_occupied_bytes =\n                (max_live_slot + 1) * slab.slot_size as u64;\n\n            let max_occupied_bytes =\n                (max_since_last_truncation + 1) * slab.slot_size as u64;\n\n            let ratio = currently_occupied_bytes * 100 / max_occupied_bytes;\n\n            if ratio < FILE_TARGET_FILL_RATIO {\n                let target_len = if max_live_slot < 16 {\n                    currently_occupied_bytes\n                } else {\n                    currently_occupied_bytes 
* FILE_RESIZE_MARGIN / 100\n                };\n\n                assert!(target_len < max_occupied_bytes);\n                assert!(\n                    target_len >= currently_occupied_bytes,\n                    \"target_len of {} is above actual occupied len of {}\",\n                    target_len,\n                    currently_occupied_bytes\n                );\n\n                if cfg!(not(feature = \"monotonic-behavior\")) {\n                    if slab.file.set_len(target_len).is_ok() {\n                        slab.max_live_slot_since_last_truncation\n                            .store(max_live_slot, Ordering::SeqCst);\n\n                        let file_truncated_bytes =\n                            currently_occupied_bytes.saturating_sub(target_len);\n                        self.truncated_file_bytes\n                            .fetch_add(file_truncated_bytes, Ordering::Release);\n\n                        truncated_files += 1;\n                        truncated_bytes += file_truncated_bytes;\n                    } else {\n                        // TODO surface stats\n                    }\n                }\n            }\n        }\n\n        let truncate_latency = before_truncate.elapsed();\n\n        let heap_files_written_to = u64::from(\n            heap_files_used_0_to_63.load(Ordering::Acquire).count_ones()\n                + heap_files_used_64_to_127\n                    .load(Ordering::Acquire)\n                    .count_ones(),\n        );\n\n        let stats = WriteBatchStats {\n            heap_bytes_written: heap_bytes_written.load(Ordering::Acquire),\n            heap_files_written_to,\n            heap_write_latency,\n            heap_sync_latency,\n            metadata_bytes_written,\n            metadata_write_latency,\n            truncated_files,\n            truncated_bytes,\n            truncate_latency,\n        };\n\n        {\n            let mut stats_tracker = self.stats.write();\n            stats_tracker.max = 
stats_tracker.max.max(&stats);\n            stats_tracker.sum = stats_tracker.sum.sum(&stats);\n        }\n\n        Ok(stats)\n    }\n\n    pub fn heap_object_id_pin(&self) -> ebr::Guard<'_, DeferredFree, 16, 16> {\n        self.free_ebr.pin()\n    }\n\n    pub fn allocate_object_id(&self) -> ObjectId {\n        self.table.allocate_object_id()\n    }\n\n    pub(crate) fn objects_to_defrag(&self) -> FnvHashSet<ObjectId> {\n        self.table.objects_to_defrag()\n    }\n}\n"
  },
  {
    "path": "src/id_allocator.rs",
    "content": "use std::collections::BTreeSet;\nuse std::sync::atomic::{AtomicU64, Ordering};\nuse std::sync::Arc;\n\nuse crossbeam_queue::SegQueue;\nuse fnv::FnvHashSet;\nuse parking_lot::Mutex;\n\n#[derive(Default, Debug)]\nstruct FreeSetAndTip {\n    free_set: BTreeSet<u64>,\n    next_to_allocate: u64,\n}\n\n#[derive(Default, Debug)]\npub struct Allocator {\n    free_and_pending: Mutex<FreeSetAndTip>,\n    /// Flat combining.\n    ///\n    /// A lock free queue of recently freed ids which uses when there is contention on `free_and_pending`.\n    free_queue: SegQueue<u64>,\n    allocation_counter: AtomicU64,\n    free_counter: AtomicU64,\n}\n\nimpl Allocator {\n    /// Intended primarily for heap slab slot allocators when performing GC.\n    ///\n    /// If the slab is fragmented beyond the desired fill ratio, this returns\n    /// the range of offsets (min inclusive, max exclusive) that may be copied\n    /// into earlier free slots if they are currently occupied in order to\n    /// achieve the desired fragmentation ratio.\n    pub fn fragmentation_cutoff(\n        &self,\n        desired_ratio: f32,\n    ) -> Option<(u64, u64)> {\n        let mut free_and_tip = self.free_and_pending.lock();\n\n        let next_to_allocate = free_and_tip.next_to_allocate;\n\n        if next_to_allocate == 0 {\n            return None;\n        }\n\n        while let Some(free_id) = self.free_queue.pop() {\n            free_and_tip.free_set.insert(free_id);\n        }\n\n        let live_objects =\n            next_to_allocate - free_and_tip.free_set.len() as u64;\n        let actual_ratio = live_objects as f32 / next_to_allocate as f32;\n\n        log::trace!(\n            \"fragmented_slots actual ratio: {actual_ratio}, free len: {}\",\n            free_and_tip.free_set.len()\n        );\n\n        if desired_ratio <= actual_ratio {\n            return None;\n        }\n\n        // calculate theoretical cut-off point, return everything past that\n        let min = 
(live_objects as f32 / desired_ratio) as u64;\n        let max = next_to_allocate;\n        assert!(min < max);\n        Some((min, max))\n    }\n\n    pub fn from_allocated(allocated: &FnvHashSet<u64>) -> Allocator {\n        let mut heap = BTreeSet::<u64>::default();\n        let max = allocated.iter().copied().max();\n\n        for i in 0..max.unwrap_or(0) {\n            if !allocated.contains(&i) {\n                heap.insert(i);\n            }\n        }\n\n        let free_and_pending = Mutex::new(FreeSetAndTip {\n            free_set: heap,\n            next_to_allocate: max.map(|m| m + 1).unwrap_or(0),\n        });\n\n        Allocator {\n            free_and_pending,\n            free_queue: SegQueue::default(),\n            allocation_counter: 0.into(),\n            free_counter: 0.into(),\n        }\n    }\n\n    pub fn max_allocated(&self) -> Option<u64> {\n        let next = self.free_and_pending.lock().next_to_allocate;\n\n        if next == 0 {\n            None\n        } else {\n            Some(next - 1)\n        }\n    }\n\n    pub fn allocate(&self) -> u64 {\n        self.allocation_counter.fetch_add(1, Ordering::Relaxed);\n        let mut free_and_tip = self.free_and_pending.lock();\n        while let Some(free_id) = self.free_queue.pop() {\n            free_and_tip.free_set.insert(free_id);\n        }\n\n        compact(&mut free_and_tip);\n\n        let pop_attempt = free_and_tip.free_set.pop_first();\n\n        if let Some(id) = pop_attempt {\n            id\n        } else {\n            let ret = free_and_tip.next_to_allocate;\n            free_and_tip.next_to_allocate += 1;\n            ret\n        }\n    }\n\n    pub fn free(&self, id: u64) {\n        if cfg!(not(feature = \"monotonic-behavior\")) {\n            self.free_counter.fetch_add(1, Ordering::Relaxed);\n            if let Some(mut free) = self.free_and_pending.try_lock() {\n                while let Some(free_id) = self.free_queue.pop() {\n                    
free.free_set.insert(free_id);\n                }\n                free.free_set.insert(id);\n\n                compact(&mut free);\n            } else {\n                self.free_queue.push(id);\n            }\n        }\n    }\n\n    /// Returns the counters for allocated, free\n    pub fn counters(&self) -> (u64, u64) {\n        (\n            self.allocation_counter.load(Ordering::Acquire),\n            self.free_counter.load(Ordering::Acquire),\n        )\n    }\n}\n\nfn compact(free: &mut FreeSetAndTip) {\n    let next = &mut free.next_to_allocate;\n\n    while *next > 1 && free.free_set.contains(&(*next - 1)) {\n        free.free_set.remove(&(*next - 1));\n        *next -= 1;\n    }\n}\n\npub struct DeferredFree {\n    pub allocator: Arc<Allocator>,\n    pub freed_slot: u64,\n}\n\nimpl Drop for DeferredFree {\n    fn drop(&mut self) {\n        self.allocator.free(self.freed_slot)\n    }\n}\n"
  },
  {
    "path": "src/leaf.rs",
    "content": "use crate::*;\n\n#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]\npub(crate) struct Leaf<const LEAF_FANOUT: usize> {\n    pub lo: InlineArray,\n    pub hi: Option<InlineArray>,\n    pub prefix_length: usize,\n    data: stack_map::StackMap<InlineArray, InlineArray, LEAF_FANOUT>,\n    pub in_memory_size: usize,\n    pub mutation_count: u64,\n    #[serde(skip)]\n    pub dirty_flush_epoch: Option<FlushEpoch>,\n    #[serde(skip)]\n    pub page_out_on_flush: Option<FlushEpoch>,\n    #[serde(skip)]\n    pub deleted: Option<FlushEpoch>,\n    #[serde(skip)]\n    pub max_unflushed_epoch: Option<FlushEpoch>,\n}\n\nimpl<const LEAF_FANOUT: usize> Leaf<LEAF_FANOUT> {\n    pub(crate) fn empty() -> Leaf<LEAF_FANOUT> {\n        Leaf {\n            lo: InlineArray::default(),\n            hi: None,\n            prefix_length: 0,\n            data: stack_map::StackMap::default(),\n            // this does not need to be marked as dirty until it actually\n            // receives inserted data\n            dirty_flush_epoch: None,\n            in_memory_size: std::mem::size_of::<Leaf<LEAF_FANOUT>>(),\n            mutation_count: 0,\n            page_out_on_flush: None,\n            deleted: None,\n            max_unflushed_epoch: None,\n        }\n    }\n\n    pub(crate) const fn is_empty(&self) -> bool {\n        self.data.is_empty()\n    }\n\n    pub(crate) fn set_dirty_epoch(&mut self, epoch: FlushEpoch) {\n        assert!(self.deleted.is_none());\n        if let Some(current_epoch) = self.dirty_flush_epoch {\n            assert!(current_epoch <= epoch);\n        }\n        if self.page_out_on_flush < Some(epoch) {\n            self.page_out_on_flush = None;\n        }\n        self.dirty_flush_epoch = Some(epoch);\n    }\n\n    fn prefix(&self) -> &[u8] {\n        assert!(self.deleted.is_none());\n        &self.lo[..self.prefix_length]\n    }\n\n    pub(crate) fn get(&self, key: &[u8]) -> Option<&InlineArray> {\n        
assert!(self.deleted.is_none());\n        assert!(key.starts_with(self.prefix()));\n        let prefixed_key = &key[self.prefix_length..];\n        self.data.get(prefixed_key)\n    }\n\n    pub(crate) fn insert(\n        &mut self,\n        key: InlineArray,\n        value: InlineArray,\n    ) -> Option<InlineArray> {\n        assert!(self.deleted.is_none());\n        assert!(key.starts_with(self.prefix()));\n        let prefixed_key = key[self.prefix_length..].into();\n        self.data.insert(prefixed_key, value)\n    }\n\n    pub(crate) fn remove(&mut self, key: &[u8]) -> Option<InlineArray> {\n        assert!(self.deleted.is_none());\n        let prefix = self.prefix();\n        assert!(key.starts_with(prefix));\n        let partial_key = &key[self.prefix_length..];\n        self.data.remove(partial_key)\n    }\n\n    pub(crate) fn merge_from(&mut self, other: &mut Self) {\n        assert!(self.is_empty());\n\n        self.hi = other.hi.clone();\n\n        let new_prefix_len = if let Some(hi) = &self.hi {\n            self.lo.iter().zip(hi.iter()).take_while(|(l, r)| l == r).count()\n        } else {\n            0\n        };\n\n        assert_eq!(self.lo[..new_prefix_len], other.lo[..new_prefix_len]);\n\n        // self.prefix_length is not read because it's expected to be\n        // initialized here.\n        self.prefix_length = new_prefix_len;\n\n        if self.prefix() == other.prefix() {\n            self.data = std::mem::take(&mut other.data);\n            return;\n        }\n\n        assert!(\n            self.prefix_length < other.prefix_length,\n            \"self: {:?} other: {:?}\",\n            self,\n            other\n        );\n\n        let unshifted_key_amount = other.prefix_length - self.prefix_length;\n        let unshifted_prefix = &other.lo\n            [other.prefix_length - unshifted_key_amount..other.prefix_length];\n\n        for (k, v) in other.data.iter() {\n            let mut unshifted_key =\n                
Vec::with_capacity(unshifted_prefix.len() + k.len());\n            unshifted_key.extend_from_slice(unshifted_prefix);\n            unshifted_key.extend_from_slice(k);\n            self.data.insert(unshifted_key.into(), v.clone());\n        }\n\n        assert_eq!(other.data.len(), self.data.len());\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        assert_eq!(\n            self.iter().collect::<Vec<_>>(),\n            other.iter().collect::<Vec<_>>(),\n            \"self: {:#?} \\n other: {:#?}\\n\",\n            self,\n            other\n        );\n    }\n\n    pub(crate) fn iter(\n        &self,\n    ) -> impl Iterator<Item = (InlineArray, InlineArray)> {\n        let prefix = self.prefix();\n        self.data.iter().map(|(k, v)| {\n            let mut unshifted_key = Vec::with_capacity(prefix.len() + k.len());\n            unshifted_key.extend_from_slice(prefix);\n            unshifted_key.extend_from_slice(k);\n            (unshifted_key.into(), v.clone())\n        })\n    }\n\n    pub(crate) fn serialize(&self, zstd_compression_level: i32) -> Vec<u8> {\n        let mut ret = vec![];\n\n        let mut zstd_enc =\n            zstd::stream::Encoder::new(&mut ret, zstd_compression_level)\n                .unwrap();\n\n        bincode::serialize_into(&mut zstd_enc, self).unwrap();\n\n        zstd_enc.finish().unwrap();\n\n        ret\n    }\n\n    pub(crate) fn deserialize(\n        buf: &[u8],\n    ) -> std::io::Result<Box<Leaf<LEAF_FANOUT>>> {\n        let zstd_decoded = zstd::stream::decode_all(buf).unwrap();\n        let mut leaf: Box<Leaf<LEAF_FANOUT>> =\n            bincode::deserialize(&zstd_decoded).unwrap();\n\n        // use decompressed buffer length as a cheap proxy for in-memory size for now\n        leaf.in_memory_size = zstd_decoded.len();\n\n        Ok(leaf)\n    }\n\n    fn set_in_memory_size(&mut self) {\n        self.in_memory_size = std::mem::size_of::<Leaf<LEAF_FANOUT>>()\n            + self.hi.as_ref().map(|h| 
h.len()).unwrap_or(0)\n            + self.lo.len()\n            + self.data.iter().map(|(k, v)| k.len() + v.len()).sum::<usize>();\n    }\n\n    pub(crate) fn split_if_full(\n        &mut self,\n        new_epoch: FlushEpoch,\n        allocator: &ObjectCache<LEAF_FANOUT>,\n        collection_id: CollectionId,\n    ) -> Option<(InlineArray, Object<LEAF_FANOUT>)> {\n        if self.data.is_full() {\n            let original_len = self.data.len();\n\n            let old_prefix_len = self.prefix_length;\n            // split\n            let split_offset = if self.lo.is_empty() {\n                // split left-most shard almost at the beginning for\n                // optimizing downward-growing workloads\n                1\n            } else if self.hi.is_none() {\n                // split right-most shard almost at the end for\n                // optimizing upward-growing workloads\n                self.data.len() - 2\n            } else {\n                self.data.len() / 2\n            };\n\n            let data = self.data.split_off(split_offset);\n\n            let left_max = &self.data.last().unwrap().0;\n            let right_min = &data.first().unwrap().0;\n\n            // suffix truncation attempts to shrink the split key\n            // so that shorter keys bubble up into the index\n            let splitpoint_length = right_min\n                .iter()\n                .zip(left_max.iter())\n                .take_while(|(a, b)| a == b)\n                .count()\n                + 1;\n\n            let mut split_vec =\n                Vec::with_capacity(self.prefix_length + splitpoint_length);\n            split_vec.extend_from_slice(self.prefix());\n            split_vec.extend_from_slice(&right_min[..splitpoint_length]);\n            let split_key = InlineArray::from(split_vec);\n\n            let rhs_id = allocator.allocate_object_id(new_epoch);\n\n            log::trace!(\n                \"split leaf {:?} at split key: {:?} into new {:?} at {:?}\",\n  
              self.lo,\n                split_key,\n                rhs_id,\n                new_epoch,\n            );\n\n            let mut rhs = Leaf {\n                dirty_flush_epoch: Some(new_epoch),\n                hi: self.hi.clone(),\n                lo: split_key.clone(),\n                prefix_length: 0,\n                in_memory_size: 0,\n                data,\n                mutation_count: 0,\n                page_out_on_flush: None,\n                deleted: None,\n                max_unflushed_epoch: None,\n            };\n\n            rhs.shorten_keys_after_split(old_prefix_len);\n\n            rhs.set_in_memory_size();\n\n            self.hi = Some(split_key.clone());\n\n            self.shorten_keys_after_split(old_prefix_len);\n\n            self.set_in_memory_size();\n\n            assert_eq!(self.hi.as_ref().unwrap(), &split_key);\n            assert_eq!(rhs.lo, &split_key);\n            assert_eq!(rhs.data.len() + self.data.len(), original_len);\n\n            let rhs_node = Object {\n                object_id: rhs_id,\n                collection_id,\n                low_key: split_key.clone(),\n                inner: Arc::new(RwLock::new(CacheBox {\n                    leaf: Some(Box::new(rhs)),\n                    logged_index: BTreeMap::default(),\n                })),\n            };\n\n            return Some((split_key, rhs_node));\n        }\n\n        None\n    }\n\n    pub(crate) fn shorten_keys_after_split(&mut self, old_prefix_len: usize) {\n        let Some(hi) = self.hi.as_ref() else { return };\n\n        let new_prefix_len =\n            self.lo.iter().zip(hi.iter()).take_while(|(l, r)| l == r).count();\n\n        assert_eq!(self.lo[..new_prefix_len], hi[..new_prefix_len]);\n\n        // self.prefix_length is not read because it's expected to be\n        // initialized here.\n        self.prefix_length = new_prefix_len;\n\n        if new_prefix_len == old_prefix_len {\n            return;\n        }\n\n        
assert!(\n            new_prefix_len > old_prefix_len,\n            \"expected new prefix length of {} to be greater than the pre-split prefix length of {} for node {:?}\",\n            new_prefix_len,\n            old_prefix_len,\n            self\n        );\n\n        let key_shift = new_prefix_len - old_prefix_len;\n\n        for (k, v) in std::mem::take(&mut self.data).iter() {\n            self.data.insert(k[key_shift..].into(), v.clone());\n        }\n    }\n}\n"
  },
  {
    "path": "src/lib.rs",
    "content": "// 1.0 blockers\n//\n// bugs\n// * page-out needs to be deferred until after any flush of the dirty epoch\n//   * need to remove max_unflushed_epoch after flushing it\n//   * can't send reliable page-out request backwards from 7->6\n//   * re-locking every mutex in a writebatch feels bad\n//   * need to signal stability status forward\n//     * maybe we already are\n//   * can make dirty_flush_epoch atomic and CAS it to 0 after flush\n//   * can change dirty_flush_epoch to unflushed_epoch\n//   * can always set mutation_count to max dirty flush epoch\n//     * this feels nice, we can lazily update a global stable flushed counter\n//     * can get rid of dirty_flush_epoch and page_out_on_flush?\n//     * or at least dirty_flush_epoch\n//   * dirty_flush_epoch really means \"hasn't yet been cooperatively serialized @ F.E.\"\n//   * interesting metrics:\n//     * whether dirty for some epoch\n//     * whether cooperatively serialized for some epoch\n//     * whether fully flushed for some epoch\n//     * clean -> dirty -> {maybe coop} -> flushed\n//   * for page-out, we only care if it's stable or if we need to add it to\n//     a page-out priority queue\n// * page-out doesn't seem to happen as expected\n//\n// reliability\n// TODO make all writes wrapped in a Tearable wrapper that splits writes\n//      and can possibly crash based on a counter.\n// TODO test concurrent drop_tree when other threads are still using it\n// TODO list trees test for recovering empty collections\n// TODO set explicit max key and value sizes w/ corresponding heap\n// TODO add failpoints to writepath\n//\n// performance\n// TODO handle prefix encoding\n// TODO (minor) remove cache access for removed node in merge function\n// TODO index+log hybrid - tinylsm key -> object location\n//\n// features\n// TODO multi-collection batch\n//\n// misc\n// TODO skim inlining output of RUSTFLAGS=\"-Cremark=all -Cdebuginfo=1\"\n//\n// ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1.0 cutoff 
~~~~~~~~~~~~~~~~~~~~~~~~~~~\n//\n// post-1.0 improvements\n//\n// reliability\n// TODO bug hiding: if the crash_iter test panics, the test doesn't fail as expected\n// TODO event log assertion for testing heap location bidirectional referential integrity,\n//      particularly in the object location mapper.\n// TODO ensure nothing \"from the future\" gets copied into earlier epochs during GC\n// TODO collection_id on page_in checks - it needs to be pinned w/ heap's EBR?\n// TODO put aborts behind feature flags for hard crashes\n// TODO re-enable transaction tests in test_tree.rs\n//\n// performance\n// TODO force writers to flush when some number of dirty epochs have built up\n// TODO serialize flush batch in parallel\n// TODO concurrent serialization of NotYetSerialized dirty objects\n// TODO make the Arc<Option<Box<Leaf just a single pointer chase w/ custom container\n// TODO allow waiting flusher to start collecting dirty pages as soon\n//      as it is evacuated - just wait until last flush is done before\n//      we persist the batch\n// TODO measure space savings vs cost of zstd in metadata store\n// TODO make EBR and index fanout consts as small as possible to reduce memory usage\n// TODO make leaf fanout as small as possible while retaining perf\n// TODO dynamically sized fanouts for reducing fragmentation\n//\n// features\n// TODO transactions\n// TODO implement create exclusive\n// TODO temporary trees for transactional in-memory coordination\n// TODO corrupted data extraction binary\n//\n\n//! `sled` is a high-performance embedded database with\n//! an API that is similar to a `BTreeMap<[u8], [u8]>`,\n//! but with several additional capabilities for\n//! assisting creators of stateful systems.\n//!\n//! It is fully thread-safe, and all operations are\n//! atomic. Multiple `Tree`s with isolated keyspaces\n//! are supported with the\n//! [`Db::open_tree`](struct.Db.html#method.open_tree) method.\n//!\n//! 
`sled` is built by experienced database engineers\n//! who think users should spend less time tuning and\n//! working against high-friction APIs. Expect\n//! significant ergonomic and performance improvements\n//! over time. Most surprises are bugs, so please\n//! [let us know](mailto:tylerneely@gmail.com?subject=sled%20sucks!!!)\n//! if something is high friction.\n//!\n//! # Examples\n//!\n//! ```\n//! # let _ = std::fs::remove_dir_all(\"my_db\");\n//! let db: sled::Db = sled::open(\"my_db\").unwrap();\n//!\n//! // insert and get\n//! db.insert(b\"yo!\", b\"v1\");\n//! assert_eq!(&db.get(b\"yo!\").unwrap().unwrap(), b\"v1\");\n//!\n//! // Atomic compare-and-swap.\n//! db.compare_and_swap(\n//!     b\"yo!\",      // key\n//!     Some(b\"v1\"), // old value, None for not present\n//!     Some(b\"v2\"), // new value, None for delete\n//! )\n//! .unwrap();\n//!\n//! // Iterates over key-value pairs, starting at the given key.\n//! let scan_key: &[u8] = b\"a non-present key before yo!\";\n//! let mut iter = db.range(scan_key..);\n//! assert_eq!(&iter.next().unwrap().unwrap().0, b\"yo!\");\n//! assert!(iter.next().is_none());\n//!\n//! db.remove(b\"yo!\");\n//! assert!(db.get(b\"yo!\").unwrap().is_none());\n//!\n//! let other_tree: sled::Tree = db.open_tree(b\"cool db facts\").unwrap();\n//! other_tree.insert(\n//!     b\"k1\",\n//!     &b\"a Db acts like a Tree due to implementing Deref<Target = Tree>\"[..]\n//! ).unwrap();\n//! # let _ = std::fs::remove_dir_all(\"my_db\");\n//! 
```\n#[cfg(feature = \"for-internal-testing-only\")]\nmod block_checker;\nmod config;\nmod db;\nmod flush_epoch;\nmod heap;\nmod id_allocator;\nmod leaf;\nmod metadata_store;\nmod object_cache;\nmod object_location_mapper;\nmod tree;\n\n#[cfg(any(\n    feature = \"testing-shred-allocator\",\n    feature = \"testing-count-allocator\"\n))]\npub mod alloc;\n\n#[cfg(feature = \"for-internal-testing-only\")]\nmod event_verifier;\n\n#[inline]\nfn debug_delay() {\n    #[cfg(debug_assertions)]\n    {\n        let rand =\n            std::time::SystemTime::UNIX_EPOCH.elapsed().unwrap().as_nanos();\n\n        if rand % 128 > 100 {\n            for _ in 0..rand % 16 {\n                std::thread::yield_now();\n            }\n        }\n    }\n}\n\npub use crate::config::Config;\npub use crate::db::Db;\npub use crate::tree::{Batch, Iter, Tree};\npub use inline_array::InlineArray;\n\nconst NAME_MAPPING_COLLECTION_ID: CollectionId = CollectionId(0);\nconst DEFAULT_COLLECTION_ID: CollectionId = CollectionId(1);\nconst INDEX_FANOUT: usize = 64;\nconst EBR_LOCAL_GC_BUFFER_SIZE: usize = 128;\n\nuse std::collections::BTreeMap;\nuse std::num::NonZeroU64;\nuse std::ops::Bound;\nuse std::sync::Arc;\n\nuse parking_lot::RwLock;\n\nuse crate::flush_epoch::{\n    FlushEpoch, FlushEpochGuard, FlushEpochTracker, FlushInvariants,\n};\nuse crate::heap::{\n    HeapStats, ObjectRecovery, SlabAddress, Update, WriteBatchStats,\n};\nuse crate::id_allocator::{Allocator, DeferredFree};\nuse crate::leaf::Leaf;\n\n// These are public so that they can be easily crash tested in external\n// binaries. They are hidden because there are zero guarantees around their\n// API stability or functionality.\n#[doc(hidden)]\npub use crate::heap::{Heap, HeapRecovery};\n#[doc(hidden)]\npub use crate::metadata_store::MetadataStore;\n#[doc(hidden)]\npub use crate::object_cache::{CacheStats, Dirty, FlushStats, ObjectCache};\n\n/// Opens a `Db` with a default configuration at the\n/// specified path. 
This will create a new storage\n/// directory at the specified path if it does\n/// not already exist. You can use the `Db::was_recovered`\n/// method to determine if your database was recovered\n/// from a previous instance.\npub fn open<P: AsRef<std::path::Path>>(path: P) -> std::io::Result<Db> {\n    Config::new().path(path).open()\n}\n\n#[derive(Debug, Copy, Clone)]\npub struct Stats {\n    pub cache: CacheStats,\n}\n\n/// Compare and swap result.\n///\n/// It returns `Ok(Ok(()))` if operation finishes successfully and\n///     - `Ok(Err(CompareAndSwapError(current, proposed)))` if operation failed\n///       to setup a new value. `CompareAndSwapError` contains current and\n///       proposed values.\n///     - `Err(Error::Unsupported)` if the database is opened in read-only mode.\n///       otherwise.\npub type CompareAndSwapResult = std::io::Result<\n    std::result::Result<CompareAndSwapSuccess, CompareAndSwapError>,\n>;\n\ntype Index<const LEAF_FANOUT: usize> = concurrent_map::ConcurrentMap<\n    InlineArray,\n    Object<LEAF_FANOUT>,\n    INDEX_FANOUT,\n    EBR_LOCAL_GC_BUFFER_SIZE,\n>;\n\n/// Compare and swap error.\n#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]\npub struct CompareAndSwapError {\n    /// The current value which caused your CAS to fail.\n    pub current: Option<InlineArray>,\n    /// Returned value that was proposed unsuccessfully.\n    pub proposed: Option<InlineArray>,\n}\n\n/// Compare and swap success.\n#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]\npub struct CompareAndSwapSuccess {\n    /// The current value which was successfully installed.\n    pub new_value: Option<InlineArray>,\n    /// Returned value that was previously stored.\n    pub previous_value: Option<InlineArray>,\n}\n\nimpl std::fmt::Display for CompareAndSwapError {\n    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {\n        write!(f, \"Compare and swap conflict\")\n    }\n}\n\nimpl std::error::Error for 
CompareAndSwapError {}\n\n#[derive(\n    Debug,\n    Clone,\n    Copy,\n    serde::Serialize,\n    serde::Deserialize,\n    PartialOrd,\n    Ord,\n    PartialEq,\n    Eq,\n    Hash,\n)]\npub struct ObjectId(NonZeroU64);\n\nimpl ObjectId {\n    fn new(from: u64) -> Option<ObjectId> {\n        NonZeroU64::new(from).map(ObjectId)\n    }\n}\n\nimpl std::ops::Deref for ObjectId {\n    type Target = u64;\n\n    fn deref(&self) -> &u64 {\n        let self_ref: &NonZeroU64 = &self.0;\n\n        // NonZeroU64 is repr(transparent) where it wraps a u64\n        // so it is guaranteed to match the binary layout. This\n        // makes it safe to cast a reference to one as a reference\n        // to the other like this.\n        let self_ptr: *const NonZeroU64 = self_ref as *const _;\n        let reference: *const u64 = self_ptr as *const u64;\n\n        unsafe { &*reference }\n    }\n}\n\nimpl concurrent_map::Minimum for ObjectId {\n    const MIN: ObjectId = ObjectId(NonZeroU64::MIN);\n}\n\n#[derive(\n    Debug,\n    Clone,\n    Copy,\n    serde::Serialize,\n    serde::Deserialize,\n    PartialOrd,\n    Ord,\n    PartialEq,\n    Eq,\n    Hash,\n)]\npub struct CollectionId(u64);\n\nimpl concurrent_map::Minimum for CollectionId {\n    const MIN: CollectionId = CollectionId(u64::MIN);\n}\n\n#[derive(Debug, Clone)]\nstruct CacheBox<const LEAF_FANOUT: usize> {\n    leaf: Option<Box<Leaf<LEAF_FANOUT>>>,\n    #[allow(unused)]\n    logged_index: BTreeMap<InlineArray, LogValue>,\n}\n\n#[allow(unused)]\n#[derive(Debug, Clone)]\nstruct LogValue {\n    location: SlabAddress,\n    value: Option<InlineArray>,\n}\n\n#[derive(Debug, Clone)]\npub struct Object<const LEAF_FANOUT: usize> {\n    object_id: ObjectId,\n    collection_id: CollectionId,\n    low_key: InlineArray,\n    inner: Arc<RwLock<CacheBox<LEAF_FANOUT>>>,\n}\n\nimpl<const LEAF_FANOUT: usize> PartialEq for Object<LEAF_FANOUT> {\n    fn eq(&self, other: &Self) -> bool {\n        self.object_id == other.object_id\n    }\n}\n\n/// 
Stored on `Db` and `Tree` in an Arc, so that when the\n/// last \"high-level\" struct is dropped, the flusher thread\n/// is cleaned up.\nstruct ShutdownDropper<const LEAF_FANOUT: usize> {\n    shutdown_sender: parking_lot::Mutex<\n        std::sync::mpsc::Sender<std::sync::mpsc::Sender<()>>,\n    >,\n    cache: parking_lot::Mutex<object_cache::ObjectCache<LEAF_FANOUT>>,\n}\n\nimpl<const LEAF_FANOUT: usize> Drop for ShutdownDropper<LEAF_FANOUT> {\n    fn drop(&mut self) {\n        let (tx, rx) = std::sync::mpsc::channel();\n        log::debug!(\"sending shutdown signal to flusher\");\n        if self.shutdown_sender.lock().send(tx).is_ok() {\n            if let Err(e) = rx.recv() {\n                log::error!(\"failed to shut down flusher thread: {:?}\", e);\n            } else {\n                log::debug!(\"flush thread successfully terminated\");\n            }\n        } else {\n            log::debug!(\n                \"failed to shut down flusher, manually flushing ObjectCache\"\n            );\n            let cache = self.cache.lock();\n            if let Err(e) = cache.flush() {\n                log::error!(\n                    \"Db flusher encountered error while flushing: {:?}\",\n                    e\n                );\n                cache.set_error(&e);\n            }\n        }\n    }\n}\n\nfn map_bound<T, U, F: FnOnce(T) -> U>(bound: Bound<T>, f: F) -> Bound<U> {\n    match bound {\n        Bound::Unbounded => Bound::Unbounded,\n        Bound::Included(x) => Bound::Included(f(x)),\n        Bound::Excluded(x) => Bound::Excluded(f(x)),\n    }\n}\n\nconst fn _assert_public_types_send_sync() {\n    use std::fmt::Debug;\n\n    const fn _assert_send<S: Send + Clone + Debug>() {}\n\n    const fn _assert_send_sync<S: Send + Sync + Clone + Debug>() {}\n\n    /*\n    _assert_send::<Subscriber>();\n    _assert_send_sync::<Event>();\n    _assert_send_sync::<Mode>();\n    _assert_send_sync::<Tree>();\n    */\n\n    _assert_send::<Db>();\n\n    
_assert_send_sync::<Batch>();\n    _assert_send_sync::<InlineArray>();\n    _assert_send_sync::<Config>();\n    _assert_send_sync::<CompareAndSwapSuccess>();\n    _assert_send_sync::<CompareAndSwapError>();\n}\n"
  },
  {
    "path": "src/metadata_store.rs",
    "content": "use std::collections::BTreeSet;\nuse std::fs;\nuse std::io::{self, Read, Write};\nuse std::num::NonZeroU64;\nuse std::path::{Path, PathBuf};\nuse std::sync::{\n    Arc,\n    atomic::{AtomicPtr, AtomicU64, Ordering},\n};\n\nuse crossbeam_channel::{Receiver, Sender, bounded, unbounded};\nuse fault_injection::{annotate, fallible, maybe};\nuse fnv::FnvHashMap;\nuse inline_array::InlineArray;\nuse parking_lot::Mutex;\nuse rayon::prelude::*;\nuse zstd::stream::read::Decoder as ZstdDecoder;\nuse zstd::stream::write::Encoder as ZstdEncoder;\n\nuse crate::{CollectionId, ObjectId, heap::UpdateMetadata};\n\nconst WARN: &str = \"DO_NOT_PUT_YOUR_FILES_HERE\";\nconst TMP_SUFFIX: &str = \".tmp\";\nconst LOG_PREFIX: &str = \"log\";\nconst SNAPSHOT_PREFIX: &str = \"snapshot\";\n\nconst ZSTD_LEVEL: i32 = 3;\n\n// NB: intentionally does not implement Clone, and\n// the Inner::drop code relies on this invariant for\n// now so that we don't free the global error until\n// all high-level structs are dropped. 
This is not\n// hard to change over time though, just a current\n// invariant.\npub struct MetadataStore {\n    inner: Inner,\n    is_shut_down: bool,\n}\n\nimpl Drop for MetadataStore {\n    fn drop(&mut self) {\n        if self.is_shut_down {\n            return;\n        }\n\n        self.shutdown_inner();\n        self.is_shut_down = true;\n    }\n}\n\nstruct MetadataRecovery {\n    recovered: Vec<UpdateMetadata>,\n    id_for_next_log: u64,\n    snapshot_size: u64,\n}\n\nstruct LogAndStats {\n    file: fs::File,\n    bytes_written: u64,\n    log_sequence_number: u64,\n}\n\nenum WorkerMessage {\n    Shutdown(Sender<()>),\n    LogReadyToCompact { log_and_stats: LogAndStats },\n}\n\nfn get_compactions(\n    rx: &mut Receiver<WorkerMessage>,\n) -> Result<Vec<u64>, Option<Sender<()>>> {\n    let mut ret = vec![];\n\n    match rx.recv() {\n        Ok(WorkerMessage::Shutdown(tx)) => {\n            return Err(Some(tx));\n        }\n        Ok(WorkerMessage::LogReadyToCompact { log_and_stats }) => {\n            ret.push(log_and_stats.log_sequence_number);\n        }\n        Err(e) => {\n            log::error!(\n                \"metadata store worker thread unable to receive message, unexpected shutdown: {e:?}\"\n            );\n            return Err(None);\n        }\n    }\n\n    // scoop up any additional logs that have built up while we were busy compacting\n    loop {\n        match rx.try_recv() {\n            Ok(WorkerMessage::Shutdown(tx)) => {\n                tx.send(()).unwrap();\n                return Err(Some(tx));\n            }\n            Ok(WorkerMessage::LogReadyToCompact { log_and_stats }) => {\n                ret.push(log_and_stats.log_sequence_number);\n            }\n            Err(_timeout) => return Ok(ret),\n        }\n    }\n}\n\nfn worker(\n    mut rx: Receiver<WorkerMessage>,\n    mut last_snapshot_lsn: u64,\n    inner: Inner,\n) {\n    loop {\n        if let Err(error) = check_error(&inner.global_error) {\n            
drop(inner);\n\n            log::error!(\n                \"compaction thread terminating after global error set to {:?}\",\n                error\n            );\n\n            return;\n        }\n\n        match get_compactions(&mut rx) {\n            Ok(log_ids) => {\n                assert_eq!(log_ids[0], last_snapshot_lsn + 1);\n\n                let write_res = read_snapshot_and_apply_logs(\n                    &inner.storage_directory,\n                    log_ids.into_iter().collect(),\n                    Some(last_snapshot_lsn),\n                    &inner.directory_lock,\n                );\n                match write_res {\n                    Err(e) => {\n                        set_error(&inner.global_error, &e);\n                        log::error!(\n                            \"log compactor thread encountered error: {:?} - setting global fatal error and shutting down compactions\",\n                            e\n                        );\n                        return;\n                    }\n                    Ok(recovery) => {\n                        inner\n                            .snapshot_size\n                            .store(recovery.snapshot_size, Ordering::SeqCst);\n                        last_snapshot_lsn =\n                            recovery.id_for_next_log.checked_sub(1).unwrap();\n                    }\n                }\n            }\n            Err(Some(tx)) => {\n                drop(inner);\n                if let Err(e) = tx.send(()) {\n                    log::error!(\n                        \"log compactor failed to send shutdown ack to system: {e:?}\"\n                    );\n                }\n                return;\n            }\n            Err(None) => {\n                return;\n            }\n        }\n    }\n}\n\nfn set_error(\n    global_error: &AtomicPtr<(io::ErrorKind, String)>,\n    error: &io::Error,\n) {\n    let kind = error.kind();\n    let reason = error.to_string();\n\n    let boxed = 
Box::new((kind, reason));\n    let ptr = Box::into_raw(boxed);\n\n    if global_error\n        .compare_exchange(\n            std::ptr::null_mut(),\n            ptr,\n            Ordering::SeqCst,\n            Ordering::SeqCst,\n        )\n        .is_err()\n    {\n        // global fatal error already installed, drop this one\n        unsafe {\n            drop(Box::from_raw(ptr));\n        }\n    }\n}\n\nfn check_error(\n    global_error: &AtomicPtr<(io::ErrorKind, String)>,\n) -> io::Result<()> {\n    let err_ptr: *const (io::ErrorKind, String) =\n        global_error.load(Ordering::Acquire);\n\n    if err_ptr.is_null() {\n        Ok(())\n    } else {\n        let deref: &(io::ErrorKind, String) = unsafe { &*err_ptr };\n        Err(io::Error::new(deref.0, deref.1.clone()))\n    }\n}\n\n#[derive(Clone)]\nstruct Inner {\n    global_error: Arc<AtomicPtr<(io::ErrorKind, String)>>,\n    active_log: Arc<Mutex<LogAndStats>>,\n    snapshot_size: Arc<AtomicU64>,\n    storage_directory: PathBuf,\n    directory_lock: Arc<fs::File>,\n    worker_outbox: Sender<WorkerMessage>,\n}\n\nimpl Drop for Inner {\n    fn drop(&mut self) {\n        // NB: this is the only place where the global error should be\n        // reclaimed in the whole sled codebase, as this Inner is only held\n        // by the background writer and the heap (in an Arc) so when this\n        // drop happens, it's because the whole system is going down, not\n        // because any particular Db instance that may have been cloned\n        // by a thread is dropping.\n        let error_ptr =\n            self.global_error.swap(std::ptr::null_mut(), Ordering::Acquire);\n        if !error_ptr.is_null() {\n            unsafe {\n                drop(Box::from_raw(error_ptr));\n            }\n        }\n    }\n}\n\nimpl MetadataStore {\n    pub fn get_global_error_arc(\n        &self,\n    ) -> Arc<AtomicPtr<(io::ErrorKind, String)>> {\n        self.inner.global_error.clone()\n    }\n\n    fn shutdown_inner(&mut 
self) {\n        let (tx, rx) = bounded(1);\n        if self.inner.worker_outbox.send(WorkerMessage::Shutdown(tx)).is_ok() {\n            let _ = rx.recv();\n        }\n\n        self.set_error(&io::Error::other(\n            \"system has been shut down\".to_string(),\n        ));\n\n        self.is_shut_down = true;\n    }\n\n    fn check_error(&self) -> io::Result<()> {\n        check_error(&self.inner.global_error)\n    }\n\n    fn set_error(&self, error: &io::Error) {\n        set_error(&self.inner.global_error, error);\n    }\n\n    /// Returns the writer handle `MetadataStore`, a sorted array of metadata, and a sorted array\n    /// of free keys.\n    pub fn recover<P: AsRef<Path>>(\n        storage_directory: P,\n    ) -> io::Result<(\n        // Metadata writer\n        MetadataStore,\n        // Metadata - node id, value, user data\n        Vec<UpdateMetadata>,\n    )> {\n        use fs2::FileExt;\n\n        // TODO NOCOMMIT\n        let sync_status = std::process::Command::new(\"sync\")\n            .status()\n            .map(|status| status.success());\n\n        if !matches!(sync_status, Ok(true)) {\n            log::warn!(\n                \"sync command before recovery failed: {:?}\",\n                sync_status\n            );\n        }\n\n        let path = storage_directory.as_ref();\n\n        // initialize directories if not present\n        if let Err(e) = fs::read_dir(path) {\n            if e.kind() == io::ErrorKind::NotFound {\n                fallible!(fs::create_dir_all(path));\n            }\n        }\n\n        let _ = fs::File::create(path.join(WARN));\n\n        let directory_lock = fallible!(fs::File::open(path));\n        fallible!(directory_lock.sync_all());\n        fallible!(directory_lock.try_lock_exclusive());\n\n        let recovery =\n            MetadataStore::recover_inner(&storage_directory, &directory_lock)?;\n\n        let new_log = LogAndStats {\n            log_sequence_number: recovery.id_for_next_log,\n            
bytes_written: 0,\n            file: fallible!(fs::File::create(log_path(\n                path,\n                recovery.id_for_next_log\n            ))),\n        };\n\n        let (tx, rx) = unbounded();\n\n        let inner = Inner {\n            snapshot_size: Arc::new(recovery.snapshot_size.into()),\n            storage_directory: path.into(),\n            directory_lock: Arc::new(directory_lock),\n            global_error: Default::default(),\n            active_log: Arc::new(Mutex::new(new_log)),\n            worker_outbox: tx,\n        };\n\n        let worker_inner = inner.clone();\n\n        let spawn_res = std::thread::Builder::new()\n            .name(\"sled_flusher\".into())\n            .spawn(move || {\n                worker(\n                    rx,\n                    recovery.id_for_next_log.checked_sub(1).unwrap(),\n                    worker_inner,\n                )\n            });\n\n        if let Err(e) = spawn_res {\n            return Err(io::Error::other(format!(\n                \"unable to spawn metadata compactor thread for sled database: {:?}\",\n                e\n            )));\n        }\n\n        Ok((MetadataStore { inner, is_shut_down: false }, recovery.recovered))\n    }\n\n    /// Returns the recovered mappings, the id for the next log file, the highest allocated object id, and the set of free ids\n    fn recover_inner<P: AsRef<Path>>(\n        storage_directory: P,\n        directory_lock: &fs::File,\n    ) -> io::Result<MetadataRecovery> {\n        let path = storage_directory.as_ref();\n\n        log::debug!(\"opening MetadataStore at {:?}\", path);\n\n        let (log_ids, snapshot_id_opt) = enumerate_logs_and_snapshot(path)?;\n\n        read_snapshot_and_apply_logs(\n            path,\n            log_ids,\n            snapshot_id_opt,\n            directory_lock,\n        )\n    }\n\n    /// Write a batch of metadata. `None` for the second half of the outer tuple represents a\n    /// deletion. 
Returns the bytes written.\n    pub fn write_batch(&self, batch: &[UpdateMetadata]) -> io::Result<u64> {\n        self.check_error()?;\n\n        let batch_bytes = serialize_batch(batch);\n        let ret = batch_bytes.len() as u64;\n\n        let mut log = self.inner.active_log.lock();\n\n        if let Err(e) = maybe!(log.file.write_all(&batch_bytes)) {\n            self.set_error(&e);\n            return Err(e);\n        }\n\n        if let Err(e) = maybe!(log.file.sync_all())\n            .and_then(|_| self.inner.directory_lock.sync_all())\n        {\n            self.set_error(&e);\n            return Err(e);\n        }\n\n        log.bytes_written += batch_bytes.len() as u64;\n\n        if log.bytes_written\n            > self.inner.snapshot_size.load(Ordering::Acquire).max(64 * 1024)\n        {\n            let next_offset = log.log_sequence_number + 1;\n            let next_path =\n                log_path(&self.inner.storage_directory, next_offset);\n\n            // open new log\n            let mut next_log_file_opts = fs::OpenOptions::new();\n            next_log_file_opts.create(true).read(true).write(true);\n\n            let next_log_file = match maybe!(next_log_file_opts.open(next_path))\n            {\n                Ok(nlf) => nlf,\n                Err(e) => {\n                    self.set_error(&e);\n                    return Err(e);\n                }\n            };\n\n            let next_log_and_stats = LogAndStats {\n                file: next_log_file,\n                log_sequence_number: next_offset,\n                bytes_written: 0,\n            };\n\n            // replace log\n            let old_log_and_stats =\n                std::mem::replace(&mut *log, next_log_and_stats);\n\n            // send to snapshot writer\n            self.inner\n                .worker_outbox\n                .send(WorkerMessage::LogReadyToCompact {\n                    log_and_stats: old_log_and_stats,\n                })\n                
.expect(\"unable to send log to compact to worker\");\n        }\n\n        Ok(ret)\n    }\n}\n\nfn serialize_batch(batch: &[UpdateMetadata]) -> Vec<u8> {\n    // we initialize the vector to contain placeholder bytes for the frame length\n    let batch_bytes = 0_u64.to_le_bytes().to_vec();\n\n    // write format:\n    //  6 byte LE frame length (in bytes, not items)\n    //  2 byte crc of the frame length\n    //  payload:\n    //      zstd encoded 8 byte LE key\n    //      zstd encoded 8 byte LE value\n    //      repeated for each kv pair\n    //  LE encoded crc32 of length + payload raw bytes, XOR 0xAF to make non-zero in empty case\n    let mut batch_encoder = ZstdEncoder::new(batch_bytes, ZSTD_LEVEL).unwrap();\n\n    for update_metadata in batch {\n        match update_metadata {\n            UpdateMetadata::Store {\n                object_id,\n                collection_id,\n                low_key,\n                location,\n            } => {\n                batch_encoder\n                    .write_all(&object_id.0.get().to_le_bytes())\n                    .unwrap();\n                batch_encoder\n                    .write_all(&collection_id.0.to_le_bytes())\n                    .unwrap();\n                batch_encoder.write_all(&location.get().to_le_bytes()).unwrap();\n\n                let low_key_len: u64 = low_key.len() as u64;\n                batch_encoder.write_all(&low_key_len.to_le_bytes()).unwrap();\n                batch_encoder.write_all(low_key).unwrap();\n            }\n            UpdateMetadata::Free { object_id, collection_id } => {\n                batch_encoder\n                    .write_all(&object_id.0.get().to_le_bytes())\n                    .unwrap();\n                batch_encoder\n                    .write_all(&collection_id.0.to_le_bytes())\n                    .unwrap();\n                // heap location\n                batch_encoder.write_all(&0_u64.to_le_bytes()).unwrap();\n                // metadata len\n            
    batch_encoder.write_all(&0_u64.to_le_bytes()).unwrap();\n            }\n        }\n    }\n\n    let mut batch_bytes = batch_encoder.finish().unwrap();\n\n    let batch_len = batch_bytes.len().checked_sub(8).unwrap();\n    batch_bytes[..8].copy_from_slice(&batch_len.to_le_bytes());\n    assert_eq!(&[0, 0], &batch_bytes[6..8]);\n\n    let len_hash: [u8; 2] =\n        (crc32fast::hash(&batch_bytes[..6]) as u16).to_le_bytes();\n\n    batch_bytes[6..8].copy_from_slice(&len_hash);\n\n    let hash: u32 = crc32fast::hash(&batch_bytes) ^ 0xAF;\n    let hash_bytes: [u8; 4] = hash.to_le_bytes();\n    batch_bytes.extend_from_slice(&hash_bytes);\n\n    batch_bytes\n}\n\nfn read_frame(\n    file: &mut fs::File,\n    reusable_frame_buffer: &mut Vec<u8>,\n) -> io::Result<Vec<UpdateMetadata>> {\n    let mut frame_size_with_crc_buf: [u8; 8] = [0; 8];\n    // TODO only break if UnexpectedEof, otherwise propagate\n    fallible!(file.read_exact(&mut frame_size_with_crc_buf));\n\n    let expected_len_hash_buf =\n        [frame_size_with_crc_buf[6], frame_size_with_crc_buf[7]];\n\n    let actual_len_hash_buf: [u8; 2] =\n        (crc32fast::hash(&frame_size_with_crc_buf[..6]) as u16).to_le_bytes();\n\n    // clear crc bytes before turning into usize\n    let mut frame_size_buf = frame_size_with_crc_buf;\n    frame_size_buf[6] = 0;\n    frame_size_buf[7] = 0;\n\n    if actual_len_hash_buf != expected_len_hash_buf {\n        return Err(annotate!(io::Error::new(\n            io::ErrorKind::InvalidData,\n            \"corrupt frame length\"\n        )));\n    }\n\n    let len_u64: u64 = u64::from_le_bytes(frame_size_buf);\n    let len: usize = usize::try_from(len_u64).unwrap();\n\n    reusable_frame_buffer.clear();\n    reusable_frame_buffer.reserve(len + 12);\n    unsafe {\n        reusable_frame_buffer.set_len(len + 12);\n    }\n    reusable_frame_buffer[..8].copy_from_slice(&frame_size_with_crc_buf);\n\n    fallible!(file.read_exact(&mut reusable_frame_buffer[8..]));\n\n    let 
crc_actual = crc32fast::hash(&reusable_frame_buffer[..len + 8]) ^ 0xAF;\n    let crc_recorded = u32::from_le_bytes([\n        reusable_frame_buffer[len + 8],\n        reusable_frame_buffer[len + 9],\n        reusable_frame_buffer[len + 10],\n        reusable_frame_buffer[len + 11],\n    ]);\n\n    if crc_actual != crc_recorded {\n        log::warn!(\"encountered incorrect crc for batch in log\");\n        return Err(annotate!(io::Error::new(\n            io::ErrorKind::InvalidData,\n            \"crc mismatch for read of batch frame\",\n        )));\n    }\n\n    let mut ret = vec![];\n\n    let mut decoder = ZstdDecoder::new(&reusable_frame_buffer[8..len + 8])\n        .expect(\"failed to create zstd decoder\");\n\n    let mut object_id_buf: [u8; 8] = [0; 8];\n    let mut collection_id_buf: [u8; 8] = [0; 8];\n    let mut location_buf: [u8; 8] = [0; 8];\n    let mut low_key_len_buf: [u8; 8] = [0; 8];\n    let mut low_key_buf = vec![];\n    loop {\n        let first_read_res = decoder\n            .read_exact(&mut object_id_buf)\n            .and_then(|_| decoder.read_exact(&mut collection_id_buf))\n            .and_then(|_| decoder.read_exact(&mut location_buf))\n            .and_then(|_| decoder.read_exact(&mut low_key_len_buf));\n\n        if let Err(e) = first_read_res {\n            if e.kind() != io::ErrorKind::UnexpectedEof {\n                return Err(e);\n            } else {\n                break;\n            }\n        }\n\n        let object_id_u64 = u64::from_le_bytes(object_id_buf);\n\n        let object_id = if let Some(object_id) = ObjectId::new(object_id_u64) {\n            object_id\n        } else {\n            return Err(annotate!(io::Error::new(\n                io::ErrorKind::InvalidData,\n                \"corrupt object ID 0 somehow passed crc check\"\n            )));\n        };\n\n        let collection_id = CollectionId(u64::from_le_bytes(collection_id_buf));\n        let location = u64::from_le_bytes(location_buf);\n\n        let 
low_key_len_raw = u64::from_le_bytes(low_key_len_buf);\n        let low_key_len = usize::try_from(low_key_len_raw).unwrap();\n\n        low_key_buf.reserve(low_key_len);\n        unsafe {\n            low_key_buf.set_len(low_key_len);\n        }\n\n        decoder\n            .read_exact(&mut low_key_buf)\n            .expect(\"we expect reads from crc-verified buffers to succeed\");\n\n        if let Some(location_nzu) = NonZeroU64::new(location) {\n            let low_key = InlineArray::from(&*low_key_buf);\n\n            ret.push(UpdateMetadata::Store {\n                object_id,\n                collection_id,\n                location: location_nzu,\n                low_key,\n            });\n        } else {\n            ret.push(UpdateMetadata::Free { object_id, collection_id });\n        }\n    }\n\n    Ok(ret)\n}\n\n// returns the deduplicated data in this log, along with an optional offset where a\n// final torn write occurred.\nfn read_log(\n    directory_path: &Path,\n    lsn: u64,\n) -> io::Result<FnvHashMap<ObjectId, UpdateMetadata>> {\n    log::trace!(\"reading log {lsn}\");\n    let mut ret = FnvHashMap::default();\n\n    let mut file = fallible!(fs::File::open(log_path(directory_path, lsn)));\n\n    let mut reusable_frame_buffer: Vec<u8> = vec![];\n\n    while let Ok(frame) = read_frame(&mut file, &mut reusable_frame_buffer) {\n        for update_metadata in frame {\n            ret.insert(update_metadata.object_id(), update_metadata);\n        }\n    }\n\n    log::trace!(\"recovered {} items in log {}\", ret.len(), lsn);\n\n    Ok(ret)\n}\n\n/// returns the data from the snapshot as well as the size of the snapshot\nfn read_snapshot(\n    directory_path: &Path,\n    lsn: u64,\n) -> io::Result<(FnvHashMap<ObjectId, UpdateMetadata>, u64)> {\n    log::trace!(\"reading snapshot {lsn}\");\n    let mut reusable_frame_buffer: Vec<u8> = vec![];\n    let mut file =\n        fallible!(fs::File::open(snapshot_path(directory_path, lsn, false)));\n    let 
size = fallible!(file.metadata()).len();\n    let raw_frame = read_frame(&mut file, &mut reusable_frame_buffer)?;\n\n    let frame: FnvHashMap<ObjectId, UpdateMetadata> = raw_frame\n        .into_iter()\n        .map(|update_metadata| (update_metadata.object_id(), update_metadata))\n        .collect();\n\n    log::trace!(\"recovered {} items in snapshot {}\", frame.len(), lsn);\n\n    Ok((frame, size))\n}\n\nfn log_path(directory_path: &Path, id: u64) -> PathBuf {\n    directory_path.join(format!(\"{LOG_PREFIX}_{:016x}\", id))\n}\n\nfn snapshot_path(directory_path: &Path, id: u64, temporary: bool) -> PathBuf {\n    if temporary {\n        directory_path\n            .join(format!(\"{SNAPSHOT_PREFIX}_{:016x}{TMP_SUFFIX}\", id))\n    } else {\n        directory_path.join(format!(\"{SNAPSHOT_PREFIX}_{:016x}\", id))\n    }\n}\n\nfn enumerate_logs_and_snapshot(\n    directory_path: &Path,\n) -> io::Result<(BTreeSet<u64>, Option<u64>)> {\n    let mut logs = BTreeSet::new();\n    let mut snapshot: Option<u64> = None;\n\n    for dir_entry_res in fallible!(fs::read_dir(directory_path)) {\n        let dir_entry = fallible!(dir_entry_res);\n        let file_name = if let Ok(f) = dir_entry.file_name().into_string() {\n            f\n        } else {\n            log::warn!(\n                \"skipping unexpected file with non-unicode name {:?}\",\n                dir_entry.file_name()\n            );\n            continue;\n        };\n\n        if file_name.ends_with(TMP_SUFFIX) {\n            log::warn!(\"removing incomplete snapshot rewrite {file_name:?}\");\n            fallible!(fs::remove_file(directory_path.join(file_name)));\n        } else if file_name.starts_with(LOG_PREFIX) {\n            let start = LOG_PREFIX.len() + 1;\n            let stop = start + 16;\n\n            if let Ok(id) = u64::from_str_radix(&file_name[start..stop], 16) {\n                logs.insert(id);\n            } else {\n                todo!()\n            }\n        } else if 
file_name.starts_with(SNAPSHOT_PREFIX) {\n            let start = SNAPSHOT_PREFIX.len() + 1;\n            let stop = start + 16;\n\n            if let Ok(id) = u64::from_str_radix(&file_name[start..stop], 16) {\n                if let Some(snap_id) = snapshot {\n                    if snap_id < id {\n                        log::warn!(\n                            \"removing stale snapshot {id} that is superseded by snapshot {id}\"\n                        );\n\n                        if let Err(e) = fs::remove_file(&file_name) {\n                            log::warn!(\n                                \"failed to remove stale snapshot file {:?}: {:?}\",\n                                file_name,\n                                e\n                            );\n                        }\n\n                        snapshot = Some(id);\n                    }\n                } else {\n                    snapshot = Some(id);\n                }\n            } else {\n                todo!()\n            }\n        }\n    }\n\n    let snap_id = snapshot.unwrap_or(0);\n    for stale_log_id in logs.range(..=snap_id) {\n        let file_name = log_path(directory_path, *stale_log_id);\n\n        log::warn!(\n            \"removing stale log {file_name:?} that is contained within snapshot {snap_id}\"\n        );\n\n        fallible!(fs::remove_file(file_name));\n    }\n    logs.retain(|l| *l > snap_id);\n\n    Ok((logs, snapshot))\n}\n\nfn read_snapshot_and_apply_logs(\n    path: &Path,\n    log_ids: BTreeSet<u64>,\n    snapshot_id_opt: Option<u64>,\n    locked_directory: &fs::File,\n) -> io::Result<MetadataRecovery> {\n    let (snapshot_tx, snapshot_rx) = bounded(1);\n    if let Some(snapshot_id) = snapshot_id_opt {\n        let path: PathBuf = path.into();\n        rayon::spawn(move || {\n            let snap_res = read_snapshot(&path, snapshot_id)\n                .map(|(snapshot, _snapshot_len)| snapshot);\n            snapshot_tx.send(snap_res).unwrap();\n        
});\n    } else {\n        snapshot_tx.send(Ok(Default::default())).unwrap();\n    }\n\n    let mut max_log_id = snapshot_id_opt.unwrap_or(0);\n\n    let log_data_res: io::Result<\n        Vec<(u64, FnvHashMap<ObjectId, UpdateMetadata>)>,\n    > = (&log_ids) //.iter().collect::<Vec<_>>())\n        .into_par_iter()\n        .map(move |log_id| {\n            if let Some(snapshot_id) = snapshot_id_opt {\n                assert!(*log_id > snapshot_id);\n            }\n\n            let log_data = read_log(path, *log_id)?;\n\n            Ok((*log_id, log_data))\n        })\n        .collect();\n\n    let mut recovered: FnvHashMap<ObjectId, UpdateMetadata> =\n        snapshot_rx.recv().unwrap()?;\n\n    log::trace!(\"recovered snapshot contains {recovered:?}\");\n\n    for (log_id, log_datum) in log_data_res? {\n        max_log_id = max_log_id.max(log_id);\n\n        for (object_id, update_metadata) in log_datum {\n            if matches!(update_metadata, UpdateMetadata::Store { .. }) {\n                recovered.insert(object_id, update_metadata);\n            } else {\n                let previous = recovered.remove(&object_id);\n                if previous.is_none() {\n                    log::trace!(\n                        \"recovered a Free for {object_id:?} without a preceding Store\"\n                    );\n                }\n            }\n        }\n    }\n\n    let mut recovered: Vec<UpdateMetadata> = recovered.into_values().collect();\n\n    recovered.par_sort_unstable();\n\n    // write fresh snapshot with recovered data\n    let new_snapshot_data = serialize_batch(&recovered);\n    let snapshot_size = new_snapshot_data.len() as u64;\n\n    let new_snapshot_tmp_path = snapshot_path(path, max_log_id, true);\n    log::trace!(\"writing snapshot to {new_snapshot_tmp_path:?}\");\n\n    let mut snapshot_file_opts = fs::OpenOptions::new();\n    snapshot_file_opts.create(true).read(false).write(true);\n\n    let mut snapshot_file =\n        
fallible!(snapshot_file_opts.open(&new_snapshot_tmp_path));\n\n    fallible!(snapshot_file.write_all(&new_snapshot_data));\n    drop(new_snapshot_data);\n\n    fallible!(snapshot_file.sync_all());\n\n    let new_snapshot_path = snapshot_path(path, max_log_id, false);\n    log::trace!(\"renaming written snapshot to {new_snapshot_path:?}\");\n    fallible!(fs::rename(new_snapshot_tmp_path, new_snapshot_path));\n    fallible!(locked_directory.sync_all());\n\n    for log_id in &log_ids {\n        let log_path = log_path(path, *log_id);\n        fallible!(fs::remove_file(log_path));\n    }\n\n    if let Some(old_snapshot_id) = snapshot_id_opt {\n        let old_snapshot_path = snapshot_path(path, old_snapshot_id, false);\n        fallible!(fs::remove_file(old_snapshot_path));\n    }\n\n    Ok(MetadataRecovery {\n        recovered,\n        id_for_next_log: max_log_id + 1,\n        snapshot_size,\n    })\n}\n"
  },
  {
    "path": "src/object_cache.rs",
    "content": "use std::cell::RefCell;\nuse std::collections::HashMap;\nuse std::io;\nuse std::sync::Arc;\nuse std::sync::atomic::{AtomicPtr, AtomicU64, Ordering};\nuse std::time::{Duration, Instant};\n\nuse cache_advisor::CacheAdvisor;\nuse concurrent_map::{ConcurrentMap, Minimum};\nuse fault_injection::annotate;\nuse inline_array::InlineArray;\nuse parking_lot::RwLock;\n\nuse crate::*;\n\n#[derive(Debug, Copy, Clone)]\npub struct CacheStats {\n    pub cache_hits: u64,\n    pub cache_misses: u64,\n    pub cache_hit_ratio: f32,\n    pub max_read_io_latency_us: u64,\n    pub sum_read_io_latency_us: u64,\n    pub deserialization_latency_max_us: u64,\n    pub deserialization_latency_sum_us: u64,\n    pub heap: HeapStats,\n    pub flush_max: FlushStats,\n    pub flush_sum: FlushStats,\n    pub compacted_heap_slots: u64,\n    pub tree_leaves_merged: u64,\n}\n\n#[derive(Default, Debug, Clone, Copy)]\npub struct FlushStats {\n    pub pre_block_on_previous_flush: Duration,\n    pub pre_block_on_current_quiescence: Duration,\n    pub serialization_latency: Duration,\n    pub compute_defrag_latency: Duration,\n    pub storage_latency: Duration,\n    pub post_write_eviction_latency: Duration,\n    pub objects_flushed: u64,\n    pub write_batch: WriteBatchStats,\n}\n\nimpl FlushStats {\n    pub fn sum(&self, other: &FlushStats) -> FlushStats {\n        use std::ops::Add;\n\n        FlushStats {\n            pre_block_on_previous_flush: self\n                .pre_block_on_previous_flush\n                .add(other.pre_block_on_previous_flush),\n            pre_block_on_current_quiescence: self\n                .pre_block_on_current_quiescence\n                .add(other.pre_block_on_current_quiescence),\n            compute_defrag_latency: self\n                .compute_defrag_latency\n                .add(other.compute_defrag_latency),\n            serialization_latency: self\n                .serialization_latency\n                .add(other.serialization_latency),\n         
   storage_latency: self.storage_latency.add(other.storage_latency),\n            post_write_eviction_latency: self\n                .post_write_eviction_latency\n                .add(other.post_write_eviction_latency),\n            objects_flushed: self.objects_flushed.add(other.objects_flushed),\n            write_batch: self.write_batch.sum(&other.write_batch),\n        }\n    }\n    pub fn max(&self, other: &FlushStats) -> FlushStats {\n        FlushStats {\n            pre_block_on_previous_flush: self\n                .pre_block_on_previous_flush\n                .max(other.pre_block_on_previous_flush),\n            pre_block_on_current_quiescence: self\n                .pre_block_on_current_quiescence\n                .max(other.pre_block_on_current_quiescence),\n            compute_defrag_latency: self\n                .compute_defrag_latency\n                .max(other.compute_defrag_latency),\n            serialization_latency: self\n                .serialization_latency\n                .max(other.serialization_latency),\n            storage_latency: self.storage_latency.max(other.storage_latency),\n            post_write_eviction_latency: self\n                .post_write_eviction_latency\n                .max(other.post_write_eviction_latency),\n            objects_flushed: self.objects_flushed.max(other.objects_flushed),\n            write_batch: self.write_batch.max(&other.write_batch),\n        }\n    }\n}\n\n#[derive(Clone, Debug, PartialEq)]\npub enum Dirty<const LEAF_FANOUT: usize> {\n    NotYetSerialized {\n        low_key: InlineArray,\n        node: Object<LEAF_FANOUT>,\n        collection_id: CollectionId,\n    },\n    CooperativelySerialized {\n        object_id: ObjectId,\n        collection_id: CollectionId,\n        low_key: InlineArray,\n        data: Arc<Vec<u8>>,\n        mutation_count: u64,\n    },\n    MergedAndDeleted {\n        object_id: ObjectId,\n        collection_id: CollectionId,\n    },\n}\n\nimpl<const LEAF_FANOUT: usize> 
Dirty<LEAF_FANOUT> {\n    pub fn is_final_state(&self) -> bool {\n        match self {\n            Dirty::NotYetSerialized { .. } => false,\n            Dirty::CooperativelySerialized { .. } => true,\n            Dirty::MergedAndDeleted { .. } => true,\n        }\n    }\n}\n\n#[derive(Debug, Default, Clone, Copy)]\nstruct FlushStatTracker {\n    count: u64,\n    sum: FlushStats,\n    max: FlushStats,\n}\n\n#[derive(Debug, Default)]\npub(crate) struct ReadStatTracker {\n    pub cache_hits: AtomicU64,\n    pub cache_misses: AtomicU64,\n    pub max_read_io_latency_us: AtomicU64,\n    pub sum_read_io_latency_us: AtomicU64,\n    pub max_deserialization_latency_us: AtomicU64,\n    pub sum_deserialization_latency_us: AtomicU64,\n}\n\n#[derive(Clone)]\npub struct ObjectCache<const LEAF_FANOUT: usize> {\n    pub config: Config,\n    global_error: Arc<AtomicPtr<(io::ErrorKind, String)>>,\n    pub object_id_index: ConcurrentMap<\n        ObjectId,\n        Object<LEAF_FANOUT>,\n        INDEX_FANOUT,\n        EBR_LOCAL_GC_BUFFER_SIZE,\n    >,\n    heap: Heap,\n    cache_advisor: RefCell<CacheAdvisor>,\n    flush_epoch: FlushEpochTracker,\n    dirty: ConcurrentMap<(FlushEpoch, ObjectId), Dirty<LEAF_FANOUT>, 4>,\n    compacted_heap_slots: Arc<AtomicU64>,\n    pub(super) tree_leaves_merged: Arc<AtomicU64>,\n    #[cfg(feature = \"for-internal-testing-only\")]\n    pub(super) event_verifier: Arc<crate::event_verifier::EventVerifier>,\n    invariants: Arc<FlushInvariants>,\n    flush_stats: Arc<RwLock<FlushStatTracker>>,\n    pub(super) read_stats: Arc<ReadStatTracker>,\n}\n\nimpl<const LEAF_FANOUT: usize> std::panic::RefUnwindSafe\n    for ObjectCache<LEAF_FANOUT>\n{\n}\n\nimpl<const LEAF_FANOUT: usize> ObjectCache<LEAF_FANOUT> {\n    /// Returns the recovered ObjectCache, the tree indexes, and a bool signifying whether the system\n    /// was recovered or not\n    pub fn recover(\n        config: &Config,\n    ) -> io::Result<(\n        ObjectCache<LEAF_FANOUT>,\n        
HashMap<CollectionId, Index<LEAF_FANOUT>>,\n        bool,\n    )> {\n        let HeapRecovery { heap, recovered_nodes, was_recovered } =\n            Heap::recover(LEAF_FANOUT, config)?;\n\n        let (object_id_index, indices) = initialize(&recovered_nodes, &heap);\n\n        // validate recovery\n        for ObjectRecovery { object_id, collection_id, low_key } in\n            recovered_nodes\n        {\n            let index = indices.get(&collection_id).unwrap();\n            let node = index.get(&low_key).unwrap();\n            assert_eq!(node.object_id, object_id);\n        }\n\n        if config.cache_capacity_bytes < 256 {\n            log::debug!(\n                \"Db configured to have Config.cache_capacity_bytes \\\n                of under 256, so we will use the minimum of 256 bytes instead\"\n            );\n        }\n\n        if config.entry_cache_percent > 80 {\n            log::debug!(\n                \"Db configured to have Config.entry_cache_percent \\\n                of over 80%, so we will clamp it to the maximum of 80% instead\"\n            );\n        }\n\n        let pc = ObjectCache {\n            config: config.clone(),\n            object_id_index,\n            cache_advisor: RefCell::new(CacheAdvisor::new(\n                config.cache_capacity_bytes.max(256),\n                config.entry_cache_percent.min(80),\n            )),\n            global_error: heap.get_global_error_arc(),\n            heap,\n            dirty: Default::default(),\n            flush_epoch: Default::default(),\n            #[cfg(feature = \"for-internal-testing-only\")]\n            event_verifier: Arc::default(),\n            compacted_heap_slots: Arc::default(),\n            tree_leaves_merged: Arc::default(),\n            invariants: Arc::default(),\n            flush_stats: Arc::default(),\n            read_stats: Arc::default(),\n        };\n\n        Ok((pc, indices, was_recovered))\n    }\n\n    pub fn is_clean(&self) -> bool {\n        
self.dirty.is_empty()\n    }\n\n    pub fn read(&self, object_id: ObjectId) -> Option<io::Result<Vec<u8>>> {\n        match self.heap.read(object_id) {\n            Some(Ok(buf)) => Some(Ok(buf)),\n            Some(Err(e)) => Some(Err(annotate!(e))),\n            None => None,\n        }\n    }\n\n    pub fn stats(&self) -> CacheStats {\n        let flush_stats = { *self.flush_stats.read() };\n        let cache_hits = self.read_stats.cache_hits.load(Ordering::Acquire);\n        let cache_misses = self.read_stats.cache_misses.load(Ordering::Acquire);\n        let cache_hit_ratio =\n            cache_hits as f32 / (cache_hits + cache_misses).max(1) as f32;\n\n        CacheStats {\n            cache_hits,\n            cache_misses,\n            cache_hit_ratio,\n            compacted_heap_slots: self\n                .compacted_heap_slots\n                .load(Ordering::Acquire),\n            tree_leaves_merged: self.tree_leaves_merged.load(Ordering::Acquire),\n            heap: self.heap.stats(),\n            flush_max: flush_stats.max,\n            flush_sum: flush_stats.sum,\n            deserialization_latency_max_us: self\n                .read_stats\n                .max_deserialization_latency_us\n                .load(Ordering::Acquire),\n            deserialization_latency_sum_us: self\n                .read_stats\n                .sum_deserialization_latency_us\n                .load(Ordering::Acquire),\n            max_read_io_latency_us: self\n                .read_stats\n                .max_read_io_latency_us\n                .load(Ordering::Acquire),\n            sum_read_io_latency_us: self\n                .read_stats\n                .sum_read_io_latency_us\n                .load(Ordering::Acquire),\n        }\n    }\n\n    pub fn check_error(&self) -> io::Result<()> {\n        let err_ptr: *const (io::ErrorKind, String) =\n            self.global_error.load(Ordering::Acquire);\n\n        if err_ptr.is_null() {\n            Ok(())\n        } else 
{\n            let deref: &(io::ErrorKind, String) = unsafe { &*err_ptr };\n            Err(io::Error::new(deref.0, deref.1.clone()))\n        }\n    }\n\n    pub fn set_error(&self, error: &io::Error) {\n        let kind = error.kind();\n        let reason = error.to_string();\n\n        let boxed = Box::new((kind, reason));\n        let ptr = Box::into_raw(boxed);\n\n        if self\n            .global_error\n            .compare_exchange(\n                std::ptr::null_mut(),\n                ptr,\n                Ordering::SeqCst,\n                Ordering::SeqCst,\n            )\n            .is_err()\n        {\n            // global fatal error already installed, drop this one\n            unsafe {\n                drop(Box::from_raw(ptr));\n            }\n        }\n    }\n\n    pub fn allocate_default_node(\n        &self,\n        collection_id: CollectionId,\n    ) -> Object<LEAF_FANOUT> {\n        let object_id = self.allocate_object_id(FlushEpoch::MIN);\n\n        let node = Object {\n            object_id,\n            collection_id,\n            low_key: InlineArray::default(),\n            inner: Arc::new(RwLock::new(CacheBox {\n                leaf: Some(Box::new(Leaf::empty())),\n                logged_index: BTreeMap::default(),\n            })),\n        };\n\n        self.object_id_index.insert(object_id, node.clone());\n\n        node\n    }\n\n    pub fn allocate_object_id(\n        &self,\n        #[allow(unused)] flush_epoch: FlushEpoch,\n    ) -> ObjectId {\n        let object_id = self.heap.allocate_object_id();\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        {\n            self.event_verifier.mark(\n                object_id,\n                flush_epoch,\n                event_verifier::State::CleanPagedIn,\n                concat!(file!(), ':', line!(), \":allocated\"),\n            );\n        }\n\n        object_id\n    }\n\n    pub fn current_flush_epoch(&self) -> FlushEpoch {\n        
self.flush_epoch.current_flush_epoch()\n    }\n\n    pub fn check_into_flush_epoch(&self) -> FlushEpochGuard {\n        self.flush_epoch.check_in()\n    }\n\n    pub fn install_dirty(\n        &self,\n        flush_epoch: FlushEpoch,\n        object_id: ObjectId,\n        dirty: Dirty<LEAF_FANOUT>,\n    ) {\n        // dirty can transition from:\n        // None -> NotYetSerialized\n        // None -> MergedAndDeleted\n        // None -> CooperativelySerialized\n        //\n        // NotYetSerialized -> MergedAndDeleted\n        // NotYetSerialized -> CooperativelySerialized\n        //\n        // if the new Dirty is final, we must assert that\n        // we are transitioning from None or NotYetSerialized.\n        //\n        // if the new Dirty is not final, we must assert\n        // that the old value is also not final.\n\n        let last_dirty_opt = self.dirty.insert((flush_epoch, object_id), dirty);\n\n        if let Some(last_dirty) = last_dirty_opt {\n            assert!(\n                !last_dirty.is_final_state(),\n                \"tried to install another Dirty marker for a node that is already\n                finalized for this flush epoch. 
\\nflush_epoch: {:?}\\nlast: {:?}\",\n                flush_epoch, last_dirty,\n            );\n        }\n    }\n\n    // NB: must not be called while holding a leaf lock - which also means\n    // that no two LeafGuards can be held concurrently in the same scope due to\n    // this being called in the destructor.\n    pub fn mark_access_and_evict(\n        &self,\n        accessed_object_id: ObjectId,\n        size: usize,\n        #[allow(unused)] flush_epoch: FlushEpoch,\n    ) -> io::Result<()> {\n        let mut ca = self.cache_advisor.borrow_mut();\n        let to_evict = ca.accessed_reuse_buffer(*accessed_object_id, size);\n        let mut not_found = 0;\n        for (node_to_evict, _rough_size) in to_evict {\n            let object_id =\n                if let Some(object_id) = ObjectId::new(*node_to_evict) {\n                    object_id\n                } else {\n                    unreachable!(\"object ID must never have been 0\");\n                };\n\n            if accessed_object_id == object_id {\n                // TODO our own object was evicted, so\n                // set page out after current epoch (or just page out if clean?)\n                continue;\n            }\n\n            let node = if let Some(n) = self.object_id_index.get(&object_id) {\n                if *n.object_id != *node_to_evict {\n                    continue;\n                }\n                n\n            } else {\n                not_found += 1;\n                continue;\n            };\n\n            let mut write = node.inner.write();\n            if write.leaf.is_none() {\n                // already paged out\n                continue;\n            }\n            let leaf: &mut Leaf<LEAF_FANOUT> = write.leaf.as_mut().unwrap();\n\n            if let Some(dirty_epoch) = leaf.dirty_flush_epoch {\n                // We can't page out this leaf until it has been\n                // flushed, because its changes are not yet durable.\n                
leaf.page_out_on_flush =\n                    leaf.page_out_on_flush.max(Some(dirty_epoch));\n            } else if let Some(max_unflushed_epoch) = leaf.max_unflushed_epoch {\n                leaf.page_out_on_flush =\n                    leaf.page_out_on_flush.max(Some(max_unflushed_epoch));\n            } else {\n                #[cfg(feature = \"for-internal-testing-only\")]\n                {\n                    self.event_verifier.mark(\n                        node.object_id,\n                        flush_epoch,\n                        event_verifier::State::PagedOut,\n                        concat!(file!(), ':', line!(), \":page-out\"),\n                    );\n                }\n                write.leaf = None;\n            }\n        }\n\n        if not_found > 0 {\n            log::trace!(\n                \"during cache eviction, did not find {} nodes that we were trying to evict\",\n                not_found\n            );\n        }\n\n        Ok(())\n    }\n\n    pub fn heap_object_id_pin(&self) -> ebr::Guard<'_, DeferredFree, 16, 16> {\n        self.heap.heap_object_id_pin()\n    }\n\n    pub fn flush(&self) -> io::Result<FlushStats> {\n        let mut write_batch = vec![];\n\n        log::trace!(\"advancing epoch\");\n        let (\n            previous_flush_complete_notifier,\n            this_vacant_notifier,\n            forward_flush_notifier,\n        ) = self.flush_epoch.roll_epoch_forward();\n\n        let before_previous_block = Instant::now();\n\n        log::trace!(\n            \"waiting for previous flush of {:?} to complete\",\n            previous_flush_complete_notifier.epoch()\n        );\n        let previous_epoch =\n            previous_flush_complete_notifier.wait_for_complete();\n\n        let pre_block_on_previous_flush = before_previous_block.elapsed();\n\n        let before_current_quiescence = Instant::now();\n\n        log::trace!(\n            \"waiting for our epoch {:?} to become vacant\",\n            
this_vacant_notifier.epoch()\n        );\n\n        assert_eq!(previous_epoch.increment(), this_vacant_notifier.epoch());\n\n        let flush_through_epoch: FlushEpoch =\n            this_vacant_notifier.wait_for_complete();\n\n        let pre_block_on_current_quiescence =\n            before_current_quiescence.elapsed();\n\n        self.invariants.mark_flushing_epoch(flush_through_epoch);\n\n        let mut objects_to_defrag = self.heap.objects_to_defrag();\n\n        let flush_boundary = (flush_through_epoch.increment(), ObjectId::MIN);\n\n        let mut evict_after_flush = vec![];\n\n        let before_serialization = Instant::now();\n\n        for ((dirty_epoch, dirty_object_id), dirty_value_initial_read) in\n            self.dirty.range(..flush_boundary)\n        {\n            objects_to_defrag.remove(&dirty_object_id);\n\n            let dirty_value = self\n                .dirty\n                .remove(&(dirty_epoch, dirty_object_id))\n                .expect(\"violation of flush responsibility\");\n\n            if let Dirty::NotYetSerialized { .. 
} = &dirty_value {\n                assert_eq!(dirty_value_initial_read, dirty_value);\n            }\n\n            // drop is necessary to increase chance of Arc strong count reaching 1\n            // while taking ownership of the value\n            drop(dirty_value_initial_read);\n\n            assert_eq!(dirty_epoch, flush_through_epoch);\n\n            match dirty_value {\n                Dirty::MergedAndDeleted { object_id, collection_id } => {\n                    assert_eq!(object_id, dirty_object_id);\n\n                    log::trace!(\n                        \"MergedAndDeleted for {:?}, adding None to write_batch\",\n                        object_id\n                    );\n                    write_batch.push(Update::Free { object_id, collection_id });\n\n                    #[cfg(feature = \"for-internal-testing-only\")]\n                    {\n                        self.event_verifier.mark(\n                            object_id,\n                            dirty_epoch,\n                            event_verifier::State::AddedToWriteBatch,\n                            concat!(\n                                file!(),\n                                ':',\n                                line!(),\n                                \":flush-merged-and-deleted\"\n                            ),\n                        );\n                    }\n                }\n                Dirty::CooperativelySerialized {\n                    object_id: _,\n                    collection_id,\n                    low_key,\n                    mutation_count: _,\n                    mut data,\n                } => {\n                    Arc::make_mut(&mut data);\n                    let data = Arc::into_inner(data).unwrap();\n                    write_batch.push(Update::Store {\n                        object_id: dirty_object_id,\n                        collection_id,\n                        low_key,\n                        data,\n                    });\n\n  
                  #[cfg(feature = \"for-internal-testing-only\")]\n                    {\n                        self.event_verifier.mark(\n                            dirty_object_id,\n                            dirty_epoch,\n                            event_verifier::State::AddedToWriteBatch,\n                            concat!(\n                                file!(),\n                                ':',\n                                line!(),\n                                \":flush-cooperative\"\n                            ),\n                        );\n                    }\n                }\n                Dirty::NotYetSerialized { low_key, collection_id, node } => {\n                    assert_eq!(low_key, node.low_key);\n                    assert_eq!(\n                        dirty_object_id, node.object_id,\n                        \"mismatched node ID for NotYetSerialized with low key {:?}\",\n                        low_key\n                    );\n                    let mut lock = node.inner.write();\n\n                    let leaf_ref: &mut Leaf<LEAF_FANOUT> = if let Some(\n                        lock_ref,\n                    ) =\n                        lock.leaf.as_mut()\n                    {\n                        lock_ref\n                    } else {\n                        #[cfg(feature = \"for-internal-testing-only\")]\n                        self.event_verifier\n                            .print_debug_history_for_object(dirty_object_id);\n\n                        panic!(\n                            \"failed to get lock for node that was NotYetSerialized, low key {:?} id {:?}\",\n                            low_key, node.object_id\n                        );\n                    };\n\n                    assert_eq!(leaf_ref.lo, low_key);\n\n                    let data = if leaf_ref.dirty_flush_epoch\n                        == Some(flush_through_epoch)\n                    {\n                        if let 
Some(deleted_at) = leaf_ref.deleted {\n                            #[cfg(feature = \"for-internal-testing-only\")]\n                            if deleted_at <= flush_through_epoch {\n                                println!(\n                                    \"{dirty_object_id:?} deleted at {deleted_at:?} \\\n                                    but we are flushing at {flush_through_epoch:?}\"\n                                );\n                                self.event_verifier\n                                    .print_debug_history_for_object(\n                                        dirty_object_id,\n                                    );\n                            }\n                            assert!(deleted_at > flush_through_epoch);\n                        }\n\n                        leaf_ref.max_unflushed_epoch =\n                            leaf_ref.dirty_flush_epoch.take();\n\n                        #[cfg(feature = \"for-internal-testing-only\")]\n                        {\n                            self.event_verifier.mark(\n                                dirty_object_id,\n                                dirty_epoch,\n                                event_verifier::State::AddedToWriteBatch,\n                                concat!(\n                                    file!(),\n                                    ':',\n                                    line!(),\n                                    \":flush-serialize\"\n                                ),\n                            );\n                        }\n\n                        leaf_ref.serialize(self.config.zstd_compression_level)\n                    } else {\n                        // Here we expect that there was a benign data race and that another thread\n                        // mutated the leaf after encountering it being dirty for our epoch, after\n                        // storing a CooperativelySerialized in the dirty map.\n                        let 
dirty_value_2_opt =\n                            self.dirty.remove(&(dirty_epoch, dirty_object_id));\n\n                        if let Some(Dirty::CooperativelySerialized {\n                            low_key: low_key_2,\n                            mutation_count: _,\n                            mut data,\n                            collection_id: ci2,\n                            object_id: ni2,\n                        }) = dirty_value_2_opt\n                        {\n                            assert_eq!(node.object_id, ni2);\n                            assert_eq!(node.object_id, dirty_object_id);\n                            assert_eq!(low_key, low_key_2);\n                            assert_eq!(node.low_key, low_key);\n                            assert_eq!(collection_id, ci2);\n                            Arc::make_mut(&mut data);\n\n                            #[cfg(feature = \"for-internal-testing-only\")]\n                            {\n                                self.event_verifier.mark(\n                                    dirty_object_id,\n                                    dirty_epoch,\n                                    event_verifier::State::AddedToWriteBatch,\n                                    concat!(\n                                        file!(),\n                                        ':',\n                                        line!(),\n                                        \":flush-laggy-cooperative\"\n                                    ),\n                                );\n                            }\n\n                            Arc::into_inner(data).unwrap()\n                        } else {\n                            log::error!(\n                                \"violation of flush responsibility for second read \\\n                                of expected cooperative serialization. leaf in question's \\\n                                dirty_flush_epoch is {:?}, our expected key was {:?}. 
node.deleted: {:?}\",\n                                leaf_ref.dirty_flush_epoch,\n                                (dirty_epoch, dirty_object_id),\n                                leaf_ref.deleted,\n                            );\n                            #[cfg(feature = \"for-internal-testing-only\")]\n                            self.event_verifier.print_debug_history_for_object(\n                                dirty_object_id,\n                            );\n                            unreachable!(\n                                \"a leaf was expected to be cooperatively serialized but it was not available. \\\n                                violation of flush responsibility for second read \\\n                                of expected cooperative serialization. leaf in question's \\\n                                dirty_flush_epoch is {:?}, our expected key was {:?}. node.deleted: {:?}\",\n                                leaf_ref.dirty_flush_epoch,\n                                (dirty_epoch, dirty_object_id),\n                                leaf_ref.deleted,\n                            );\n                        }\n                    };\n\n                    write_batch.push(Update::Store {\n                        object_id: dirty_object_id,\n                        collection_id,\n                        low_key,\n                        data,\n                    });\n\n                    if leaf_ref.page_out_on_flush == Some(flush_through_epoch) {\n                        // page_out_on_flush is set to false\n                        // on page-in due to serde(skip)\n                        evict_after_flush.push(node.clone());\n                    }\n                }\n            }\n        }\n\n        if !objects_to_defrag.is_empty() {\n            log::debug!(\n                \"objects to defrag (after flush loop): {}\",\n                objects_to_defrag.len()\n            );\n            self.compacted_heap_slots\n                
.fetch_add(objects_to_defrag.len() as u64, Ordering::Relaxed);\n        }\n\n        let before_compute_defrag = Instant::now();\n\n        if cfg!(not(feature = \"monotonic-behavior\")) {\n            let mut object_not_found = 0;\n\n            for fragmented_object_id in objects_to_defrag {\n                let object_opt =\n                    self.object_id_index.get(&fragmented_object_id);\n\n                let object = if let Some(object) = object_opt {\n                    object\n                } else {\n                    object_not_found += 1;\n                    continue;\n                };\n\n                if let Some(ref inner) = object.inner.read().leaf {\n                    if let Some(dirty) = inner.dirty_flush_epoch {\n                        assert!(dirty > flush_through_epoch);\n                        // This object will be rewritten anyway when its dirty epoch gets flushed\n                        continue;\n                    }\n                }\n\n                let data = match self.read(fragmented_object_id) {\n                    Some(Ok(data)) => data,\n                    Some(Err(e)) => {\n                        let annotated = annotate!(e);\n                        log::error!(\n                            \"failed to read object during GC: {annotated:?}\"\n                        );\n                        continue;\n                    }\n                    None => {\n                        log::error!(\n                            \"failed to read object during GC: object not found\"\n                        );\n                        continue;\n                    }\n                };\n\n                write_batch.push(Update::Store {\n                    object_id: fragmented_object_id,\n                    collection_id: object.collection_id,\n                    low_key: object.low_key,\n                    data,\n                });\n            }\n\n            if object_not_found > 0 {\n                
log::debug!(\n                    \"{} objects not found while defragmenting\",\n                    object_not_found\n                );\n            }\n        }\n\n        let compute_defrag_latency = before_compute_defrag.elapsed();\n\n        let serialization_latency = before_serialization.elapsed();\n\n        let before_storage = Instant::now();\n\n        let objects_flushed = write_batch.len() as u64;\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        let write_batch_object_ids: Vec<ObjectId> =\n            write_batch.iter().map(Update::object_id).collect();\n\n        let write_batch_stats = if objects_flushed > 0 {\n            let write_batch_stats = self.heap.write_batch(write_batch)?;\n            log::trace!(\n                \"marking {flush_through_epoch:?} as flushed - \\\n                {objects_flushed} objects written, {write_batch_stats:?}\",\n            );\n            write_batch_stats\n        } else {\n            WriteBatchStats::default()\n        };\n\n        let storage_latency = before_storage.elapsed();\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        {\n            for update_object_id in write_batch_object_ids {\n                self.event_verifier.mark(\n                    update_object_id,\n                    flush_through_epoch,\n                    event_verifier::State::Flushed,\n                    concat!(file!(), ':', line!(), \":flush-finished\"),\n                );\n            }\n        }\n\n        log::trace!(\n            \"marking the forward flush notifier that {:?} is flushed\",\n            flush_through_epoch\n        );\n\n        self.invariants.mark_flushed_epoch(flush_through_epoch);\n\n        forward_flush_notifier.mark_complete();\n\n        let before_eviction = Instant::now();\n\n        for node_to_evict in evict_after_flush {\n            // NB: since we dropped this leaf and lock after we marked its\n            // node in evict_after_flush, it's possible 
that it may have\n            // been written to afterwards.\n            let mut lock = node_to_evict.inner.write();\n            let leaf = lock.leaf.as_mut().unwrap();\n\n            if let Some(dirty_epoch) = leaf.dirty_flush_epoch {\n                if dirty_epoch != flush_through_epoch {\n                    continue;\n                }\n            } else {\n                continue;\n            }\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            {\n                self.event_verifier.mark(\n                    node_to_evict.object_id,\n                    flush_through_epoch,\n                    event_verifier::State::PagedOut,\n                    concat!(file!(), ':', line!(), \":page-out-after-flush\"),\n                );\n            }\n\n            lock.leaf = None;\n        }\n\n        let post_write_eviction_latency = before_eviction.elapsed();\n\n        // kick forward the low level epoch-based reclamation systems\n        // because this operation can cause a lot of garbage to build\n        // up, and this speeds up its reclamation.\n        self.flush_epoch.manually_advance_epoch();\n        self.heap.manually_advance_epoch();\n\n        let ret = FlushStats {\n            pre_block_on_current_quiescence,\n            pre_block_on_previous_flush,\n            serialization_latency,\n            storage_latency,\n            post_write_eviction_latency,\n            objects_flushed,\n            write_batch: write_batch_stats,\n            compute_defrag_latency,\n        };\n\n        let mut flush_stats = self.flush_stats.write();\n        flush_stats.count += 1;\n        flush_stats.max = flush_stats.max.max(&ret);\n        flush_stats.sum = flush_stats.sum.sum(&ret);\n\n        assert_eq!(self.dirty.range(..flush_boundary).count(), 0);\n\n        Ok(ret)\n    }\n}\n\nfn initialize<const LEAF_FANOUT: usize>(\n    recovered_nodes: &[ObjectRecovery],\n    heap: &Heap,\n) -> (\n    ConcurrentMap<\n        
ObjectId,\n        Object<LEAF_FANOUT>,\n        INDEX_FANOUT,\n        EBR_LOCAL_GC_BUFFER_SIZE,\n    >,\n    HashMap<CollectionId, Index<LEAF_FANOUT>>,\n) {\n    let mut trees: HashMap<CollectionId, Index<LEAF_FANOUT>> = HashMap::new();\n\n    let object_id_index: ConcurrentMap<\n        ObjectId,\n        Object<LEAF_FANOUT>,\n        INDEX_FANOUT,\n        EBR_LOCAL_GC_BUFFER_SIZE,\n    > = ConcurrentMap::default();\n\n    for ObjectRecovery { object_id, collection_id, low_key } in recovered_nodes\n    {\n        let node = Object {\n            object_id: *object_id,\n            collection_id: *collection_id,\n            low_key: low_key.clone(),\n            inner: Arc::new(RwLock::new(CacheBox {\n                leaf: None,\n                logged_index: BTreeMap::default(),\n            })),\n        };\n\n        assert!(object_id_index.insert(*object_id, node.clone()).is_none());\n\n        let tree = trees.entry(*collection_id).or_default();\n\n        assert!(\n            tree.insert(low_key.clone(), node).is_none(),\n            \"inserted multiple objects with low key {:?}\",\n            low_key\n        );\n    }\n\n    // initialize default collections if not recovered\n    for collection_id in [NAME_MAPPING_COLLECTION_ID, DEFAULT_COLLECTION_ID] {\n        let tree = trees.entry(collection_id).or_default();\n\n        if tree.is_empty() {\n            let object_id = heap.allocate_object_id();\n\n            let initial_low_key = InlineArray::MIN;\n\n            let empty_node = Object {\n                object_id,\n                collection_id,\n                low_key: initial_low_key.clone(),\n                inner: Arc::new(RwLock::new(CacheBox {\n                    leaf: Some(Box::new(Leaf::empty())),\n                    logged_index: BTreeMap::default(),\n                })),\n            };\n\n            assert!(\n                object_id_index.insert(object_id, empty_node.clone()).is_none()\n            );\n\n            
assert!(tree.insert(initial_low_key, empty_node).is_none());\n        } else {\n            assert!(\n                tree.contains_key(&InlineArray::MIN),\n                \"tree {:?} had no minimum node\",\n                collection_id\n            );\n        }\n    }\n\n    for (cid, tree) in &trees {\n        assert!(\n            tree.contains_key(&InlineArray::MIN),\n            \"tree {:?} had no minimum node\",\n            cid\n        );\n    }\n\n    (object_id_index, trees)\n}\n"
  },
  {
    "path": "src/object_location_mapper.rs",
    "content": "use std::num::NonZeroU64;\nuse std::sync::Arc;\nuse std::sync::atomic::{AtomicU64, Ordering};\n\nuse fnv::FnvHashSet;\nuse pagetable::PageTable;\n\nuse crate::{\n    Allocator, ObjectId,\n    heap::{N_SLABS, SlabAddress, UpdateMetadata},\n};\n\n#[derive(Debug, Default, Copy, Clone)]\npub struct AllocatorStats {\n    pub objects_allocated: u64,\n    pub objects_freed: u64,\n    pub heap_slots_allocated: u64,\n    pub heap_slots_freed: u64,\n}\n\n#[derive(Default)]\nstruct SlabTenancy {\n    slot_to_object_id: PageTable<AtomicU64>,\n    slot_allocator: Arc<Allocator>,\n}\n\nimpl SlabTenancy {\n    // returns (ObjectId, slot index) pairs\n    fn objects_to_defrag(\n        &self,\n        target_fill_ratio: f32,\n    ) -> Vec<(ObjectId, u64)> {\n        let (frag_min, frag_max) = if let Some(frag) =\n            self.slot_allocator.fragmentation_cutoff(target_fill_ratio)\n        {\n            frag\n        } else {\n            return vec![];\n        };\n\n        let mut ret = vec![];\n\n        for fragmented_slot in frag_min..frag_max {\n            let object_id_u64 = self\n                .slot_to_object_id\n                .get(fragmented_slot)\n                .load(Ordering::Acquire);\n\n            if let Some(object_id) = ObjectId::new(object_id_u64) {\n                ret.push((object_id, fragmented_slot));\n            }\n        }\n\n        ret\n    }\n}\n\n#[derive(Clone)]\npub(crate) struct ObjectLocationMapper {\n    object_id_to_location: PageTable<AtomicU64>,\n    slab_tenancies: Arc<[SlabTenancy; N_SLABS]>,\n    object_id_allocator: Arc<Allocator>,\n    target_fill_ratio: f32,\n}\n\nimpl ObjectLocationMapper {\n    pub(crate) fn new(\n        recovered_metadata: &[UpdateMetadata],\n        target_fill_ratio: f32,\n    ) -> ObjectLocationMapper {\n        let mut ret = ObjectLocationMapper {\n            object_id_to_location: PageTable::default(),\n            slab_tenancies: Arc::new(core::array::from_fn(|_| {\n                
SlabTenancy::default()\n            })),\n            object_id_allocator: Arc::default(),\n            target_fill_ratio,\n        };\n\n        let mut object_ids: FnvHashSet<u64> = Default::default();\n        let mut slots_per_slab: [FnvHashSet<u64>; N_SLABS] =\n            core::array::from_fn(|_| Default::default());\n\n        for update_metadata in recovered_metadata {\n            match update_metadata {\n                UpdateMetadata::Store {\n                    object_id,\n                    collection_id: _,\n                    location,\n                    low_key: _,\n                } => {\n                    object_ids.insert(**object_id);\n                    let slab_address = SlabAddress::from(*location);\n                    slots_per_slab[slab_address.slab() as usize]\n                        .insert(slab_address.slot());\n                    ret.insert(*object_id, slab_address);\n                }\n                UpdateMetadata::Free { .. } => {\n                    unreachable!()\n                }\n            }\n        }\n\n        ret.object_id_allocator =\n            Arc::new(Allocator::from_allocated(&object_ids));\n\n        let slabs = Arc::get_mut(&mut ret.slab_tenancies).unwrap();\n\n        for i in 0..N_SLABS {\n            let slab = &mut slabs[i];\n            slab.slot_allocator =\n                Arc::new(Allocator::from_allocated(&slots_per_slab[i]));\n        }\n\n        ret\n    }\n\n    pub(crate) fn get_max_allocated_per_slab(&self) -> Vec<(usize, u64)> {\n        let mut ret = vec![];\n\n        for (i, slab) in self.slab_tenancies.iter().enumerate() {\n            if let Some(max_allocated) = slab.slot_allocator.max_allocated() {\n                ret.push((i, max_allocated));\n            }\n        }\n\n        ret\n    }\n\n    pub(crate) fn stats(&self) -> AllocatorStats {\n        let (objects_allocated, objects_freed) =\n            self.object_id_allocator.counters();\n\n        let mut 
heap_slots_allocated = 0;\n        let mut heap_slots_freed = 0;\n\n        for slab_id in 0..N_SLABS {\n            let (allocated, freed) =\n                self.slab_tenancies[slab_id].slot_allocator.counters();\n            heap_slots_allocated += allocated;\n            heap_slots_freed += freed;\n        }\n\n        AllocatorStats {\n            objects_allocated,\n            objects_freed,\n            heap_slots_allocated,\n            heap_slots_freed,\n        }\n    }\n\n    pub(crate) fn clone_object_id_allocator_arc(&self) -> Arc<Allocator> {\n        self.object_id_allocator.clone()\n    }\n\n    pub(crate) fn allocate_object_id(&self) -> ObjectId {\n        // object IDs wrap a NonZeroU64, so if we get 0, just re-allocate and leak the id\n\n        let mut object_id = self.object_id_allocator.allocate();\n        if object_id == 0 {\n            object_id = self.object_id_allocator.allocate();\n            assert_ne!(object_id, 0);\n        }\n        ObjectId::new(object_id).unwrap()\n    }\n\n    pub(crate) fn clone_slab_allocator_arc(\n        &self,\n        slab_id: u8,\n    ) -> Arc<Allocator> {\n        self.slab_tenancies[usize::from(slab_id)].slot_allocator.clone()\n    }\n\n    pub(crate) fn allocate_slab_slot(&self, slab_id: u8) -> SlabAddress {\n        let slot =\n            self.slab_tenancies[usize::from(slab_id)].slot_allocator.allocate();\n        SlabAddress::from_slab_slot(slab_id, slot)\n    }\n\n    pub(crate) fn free_slab_slot(&self, slab_address: SlabAddress) {\n        self.slab_tenancies[usize::from(slab_address.slab())]\n            .slot_allocator\n            .free(slab_address.slot())\n    }\n\n    pub(crate) fn get_location_for_object(\n        &self,\n        object_id: ObjectId,\n    ) -> Option<crate::SlabAddress> {\n        let location_u64 =\n            self.object_id_to_location.get(*object_id).load(Ordering::Acquire);\n\n        let nzu = NonZeroU64::new(location_u64)?;\n\n        
Some(SlabAddress::from(nzu))\n    }\n\n    /// Returns the previous address for this object, if it is vacating one.\n    ///\n    /// # Panics\n    ///\n    /// Asserts that the new location is actually unoccupied. This is a major\n    /// correctness violation if that isn't true.\n    pub(crate) fn insert(\n        &self,\n        object_id: ObjectId,\n        new_location: SlabAddress,\n    ) -> Option<SlabAddress> {\n        // insert into object_id_to_location\n        let location_nzu: NonZeroU64 = new_location.into();\n        let location_u64 = location_nzu.get();\n\n        let last_u64 = self\n            .object_id_to_location\n            .get(*object_id)\n            .swap(location_u64, Ordering::Release);\n\n        let last_address_opt = if let Some(nzu) = NonZeroU64::new(last_u64) {\n            let last_address = SlabAddress::from(nzu);\n            Some(last_address)\n        } else {\n            None\n        };\n\n        // insert into slab_tenancies\n        let slab = new_location.slab();\n        let slot = new_location.slot();\n\n        let _last_oid_at_location = self.slab_tenancies[usize::from(slab)]\n            .slot_to_object_id\n            .get(slot)\n            .swap(*object_id, Ordering::Release);\n\n        // TODO add debug event verifier here assert_eq!(0, last_oid_at_location);\n\n        last_address_opt\n    }\n\n    /// Unmaps an object and returns its location.\n    ///\n    /// # Panics\n    ///\n    /// Asserts that the object was actually stored in a location.\n    pub(crate) fn remove(&self, object_id: ObjectId) -> Option<SlabAddress> {\n        let last_u64 = self\n            .object_id_to_location\n            .get(*object_id)\n            .swap(0, Ordering::Release);\n\n        if let Some(nzu) = NonZeroU64::new(last_u64) {\n            let last_address = SlabAddress::from(nzu);\n\n            let slab = last_address.slab();\n            let slot = last_address.slot();\n\n            let last_oid_at_location = 
self.slab_tenancies[usize::from(slab)]\n                .slot_to_object_id\n                .get(slot)\n                .swap(0, Ordering::Release);\n\n            assert_eq!(*object_id, last_oid_at_location);\n\n            Some(last_address)\n        } else {\n            None\n        }\n    }\n\n    pub(crate) fn objects_to_defrag(&self) -> FnvHashSet<ObjectId> {\n        let mut ret = FnvHashSet::default();\n\n        for slab_id in 0..N_SLABS {\n            let slab = &self.slab_tenancies[slab_id];\n\n            for (object_id, slot) in\n                slab.objects_to_defrag(self.target_fill_ratio)\n            {\n                let sa = SlabAddress::from_slab_slot(\n                    u8::try_from(slab_id).unwrap(),\n                    slot,\n                );\n\n                let rt_sa = if let Some(rt_raw_sa) = NonZeroU64::new(\n                    self.object_id_to_location\n                        .get(*object_id)\n                        .load(Ordering::Acquire),\n                ) {\n                    SlabAddress::from(rt_raw_sa)\n                } else {\n                    // object has been removed but its slot has not yet been freed,\n                    // hopefully due to a deferred write\n                    // TODO test that with a testing event log\n                    continue;\n                };\n\n                if sa == rt_sa {\n                    let newly_inserted = ret.insert(object_id);\n                    assert!(\n                        newly_inserted,\n                        \"{object_id:?} present multiple times across slab objects_to_defrag\"\n                    );\n                }\n            }\n        }\n\n        ret\n    }\n}\n"
  },
  {
    "path": "src/tree.rs",
    "content": "use std::collections::{BTreeMap, VecDeque};\nuse std::fmt;\nuse std::hint;\nuse std::io;\nuse std::mem::ManuallyDrop;\nuse std::ops;\nuse std::ops::Bound;\nuse std::ops::RangeBounds;\nuse std::sync::Arc;\nuse std::sync::atomic::Ordering;\nuse std::time::Instant;\n\nuse concurrent_map::Minimum;\nuse fault_injection::annotate;\nuse inline_array::InlineArray;\nuse parking_lot::{\n    RawRwLock,\n    lock_api::{ArcRwLockReadGuard, ArcRwLockWriteGuard},\n};\n\nuse crate::*;\n\n#[cfg(feature = \"for-internal-testing-only\")]\nuse crate::block_checker::track_blocks;\n\n#[derive(Clone)]\npub struct Tree<const LEAF_FANOUT: usize = 1024> {\n    collection_id: CollectionId,\n    cache: ObjectCache<LEAF_FANOUT>,\n    pub(crate) index: Index<LEAF_FANOUT>,\n    _shutdown_dropper: Arc<ShutdownDropper<LEAF_FANOUT>>,\n}\n\nimpl<const LEAF_FANOUT: usize> Drop for Tree<LEAF_FANOUT> {\n    fn drop(&mut self) {\n        if self.cache.config.flush_every_ms.is_none() {\n            if let Err(e) = self.flush() {\n                log::error!(\"failed to flush Db on Drop: {e:?}\");\n            }\n        } else {\n            // otherwise, it is expected that the flusher thread will\n            // flush while shutting down the final Db/Tree instance\n        }\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> fmt::Debug for Tree<LEAF_FANOUT> {\n    fn fmt(&self, w: &mut fmt::Formatter<'_>) -> fmt::Result {\n        let alternate = w.alternate();\n\n        let mut debug_struct = w.debug_struct(&format!(\"Db<{}>\", LEAF_FANOUT));\n\n        if alternate {\n            debug_struct\n                .field(\"global_error\", &self.check_error())\n                .field(\n                    \"data\",\n                    &format!(\"{:?}\", self.iter().collect::<Vec<_>>()),\n                )\n                .finish()\n        } else {\n            debug_struct.field(\"global_error\", &self.check_error()).finish()\n        }\n    }\n}\n\n#[must_use]\nstruct LeafReadGuard<'a, const 
LEAF_FANOUT: usize = 1024> {\n    leaf_read:\n        ManuallyDrop<ArcRwLockReadGuard<RawRwLock, CacheBox<LEAF_FANOUT>>>,\n    low_key: InlineArray,\n    inner: &'a Tree<LEAF_FANOUT>,\n    object_id: ObjectId,\n    external_cache_access_and_eviction: bool,\n}\n\nimpl<const LEAF_FANOUT: usize> Drop for LeafReadGuard<'_, LEAF_FANOUT> {\n    fn drop(&mut self) {\n        let size = self.leaf_read.leaf.as_ref().unwrap().in_memory_size;\n        // we must drop our mutex before calling mark_access_and_evict\n        unsafe {\n            ManuallyDrop::drop(&mut self.leaf_read);\n        }\n        if self.external_cache_access_and_eviction {\n            return;\n        }\n\n        let current_epoch = self.inner.cache.current_flush_epoch();\n\n        if let Err(e) = self.inner.cache.mark_access_and_evict(\n            self.object_id,\n            size,\n            current_epoch,\n        ) {\n            self.inner.set_error(&e);\n            log::error!(\n                \"io error while paging out dirty data: {:?} \\\n                for guard of leaf with low key {:?}\",\n                e,\n                self.low_key\n            );\n        }\n    }\n}\n\nstruct LeafWriteGuard<'a, const LEAF_FANOUT: usize = 1024> {\n    leaf_write:\n        ManuallyDrop<ArcRwLockWriteGuard<RawRwLock, CacheBox<LEAF_FANOUT>>>,\n    flush_epoch_guard: FlushEpochGuard<'a>,\n    low_key: InlineArray,\n    inner: &'a Tree<LEAF_FANOUT>,\n    node: Object<LEAF_FANOUT>,\n    external_cache_access_and_eviction: bool,\n}\n\nimpl<const LEAF_FANOUT: usize> LeafWriteGuard<'_, LEAF_FANOUT> {\n    fn epoch(&self) -> FlushEpoch {\n        self.flush_epoch_guard.epoch()\n    }\n\n    // Handling cache access involves acquiring a mutex to anything\n    // that is being paged-out so that it can be dropped. 
We call\n    // this for things that we want to perform cache\n    fn handle_cache_access_and_eviction_externally(\n        mut self,\n    ) -> (ObjectId, usize) {\n        self.external_cache_access_and_eviction = true;\n        (\n            self.node.object_id,\n            self.leaf_write.leaf.as_ref().unwrap().in_memory_size,\n        )\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> Drop for LeafWriteGuard<'_, LEAF_FANOUT> {\n    fn drop(&mut self) {\n        let size = self.leaf_write.leaf.as_ref().unwrap().in_memory_size;\n\n        // we must drop our mutex before calling mark_access_and_evict\n        unsafe {\n            ManuallyDrop::drop(&mut self.leaf_write);\n        }\n        if self.external_cache_access_and_eviction {\n            return;\n        }\n\n        if let Err(e) = self.inner.cache.mark_access_and_evict(\n            self.node.object_id,\n            size,\n            self.epoch(),\n        ) {\n            self.inner.set_error(&e);\n            log::error!(\"io error while paging out dirty data: {:?}\", e);\n        }\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> Tree<LEAF_FANOUT> {\n    pub(crate) fn new(\n        collection_id: CollectionId,\n        cache: ObjectCache<LEAF_FANOUT>,\n        index: Index<LEAF_FANOUT>,\n        _shutdown_dropper: Arc<ShutdownDropper<LEAF_FANOUT>>,\n    ) -> Tree<LEAF_FANOUT> {\n        Tree { collection_id, cache, index, _shutdown_dropper }\n    }\n\n    // This is only pub for an extra assertion during testing.\n    #[doc(hidden)]\n    pub fn check_error(&self) -> io::Result<()> {\n        self.cache.check_error()\n    }\n\n    fn set_error(&self, error: &io::Error) {\n        self.cache.set_error(error)\n    }\n\n    pub fn storage_stats(&self) -> Stats {\n        Stats { cache: self.cache.stats() }\n    }\n\n    /// Synchronously flushes all dirty IO buffers and calls\n    /// fsync. If this succeeds, it is guaranteed that all\n    /// previous writes will be recovered if the system\n    /// crashes. 
Returns the number of bytes flushed during\n    /// this call.\n    ///\n    /// Flushing can take quite a lot of time, and you should\n    /// measure the performance impact of using it on\n    /// realistic sustained workloads running on realistic\n    /// hardware.\n    ///\n    /// This is called automatically on drop of the last open Db\n    /// instance.\n    pub fn flush(&self) -> io::Result<FlushStats> {\n        self.cache.flush()\n    }\n\n    pub(crate) fn page_in(\n        &self,\n        key: &[u8],\n        flush_epoch: FlushEpoch,\n    ) -> io::Result<(\n        InlineArray,\n        ArcRwLockWriteGuard<RawRwLock, CacheBox<LEAF_FANOUT>>,\n        Object<LEAF_FANOUT>,\n    )> {\n        let before_read_io = Instant::now();\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        let _b0 = track_blocks();\n\n        let mut loops: u64 = 0;\n        let mut last_continue = \"none\";\n        let mut warned = false;\n\n        loop {\n            loops += 1;\n\n            if loops > 10_000_000 && !warned {\n                log::warn!(\n                    \"page_in spinning for a long time due to continue point {}\",\n                    last_continue\n                );\n                warned = true;\n\n                #[cfg(feature = \"for-internal-testing-only\")]\n                assert!(\n                    loops <= 10_000_000,\n                    \"stuck in loop at continue point: {}, search key: {:?}\",\n                    last_continue,\n                    key,\n                );\n            }\n\n            let _heap_pin = self.cache.heap_object_id_pin();\n\n            let (low_key, node) = self.index.get_lte(key).unwrap();\n            if node.collection_id != self.collection_id {\n                log::trace!(\"retry due to mismatched collection id in page_in\");\n\n                hint::spin_loop();\n\n                last_continue = concat!(file!(), ':', line!());\n\n                continue;\n            }\n\n            
#[cfg(feature = \"for-internal-testing-only\")]\n            let _b1 = track_blocks();\n\n            let mut write = node.inner.write_arc();\n            if write.leaf.is_none() {\n                self.cache\n                    .read_stats\n                    .cache_misses\n                    .fetch_add(1, Ordering::Relaxed);\n\n                let leaf_bytes =\n                    if let Some(read_res) = self.cache.read(node.object_id) {\n                        match read_res {\n                            Ok(buf) => buf,\n                            Err(e) => return Err(annotate!(e)),\n                        }\n                    } else {\n                        hint::spin_loop();\n\n                        last_continue = concat!(file!(), ':', line!());\n\n                        continue;\n                    };\n\n                let read_io_latency_us =\n                    u64::try_from(before_read_io.elapsed().as_micros())\n                        .unwrap();\n                self.cache\n                    .read_stats\n                    .max_read_io_latency_us\n                    .fetch_max(read_io_latency_us, Ordering::Relaxed);\n                self.cache\n                    .read_stats\n                    .sum_read_io_latency_us\n                    .fetch_add(read_io_latency_us, Ordering::Relaxed);\n\n                let before_deserialization = Instant::now();\n\n                let leaf: Box<Leaf<LEAF_FANOUT>> =\n                    Leaf::deserialize(&leaf_bytes).unwrap();\n\n                if leaf.lo != low_key {\n                    // TODO determine why this rare situation occurs and better\n                    // understand whether it is really benign.\n                    log::trace!(\"mismatch between object key and leaf low\");\n\n                    hint::spin_loop();\n\n                    last_continue = concat!(file!(), ':', line!());\n\n                    continue;\n                }\n\n                let 
deserialization_latency_us =\n                    u64::try_from(before_deserialization.elapsed().as_micros())\n                        .unwrap();\n                self.cache\n                    .read_stats\n                    .max_deserialization_latency_us\n                    .fetch_max(deserialization_latency_us, Ordering::Relaxed);\n                self.cache\n                    .read_stats\n                    .sum_deserialization_latency_us\n                    .fetch_add(deserialization_latency_us, Ordering::Relaxed);\n\n                #[cfg(feature = \"for-internal-testing-only\")]\n                {\n                    self.cache.event_verifier.mark(\n                        node.object_id,\n                        FlushEpoch::MIN,\n                        event_verifier::State::CleanPagedIn,\n                        concat!(file!(), ':', line!(), \":page-in\"),\n                    );\n                }\n\n                write.leaf = Some(leaf);\n            } else {\n                self.cache\n                    .read_stats\n                    .cache_hits\n                    .fetch_add(1, Ordering::Relaxed);\n            }\n            let leaf = write.leaf.as_mut().unwrap();\n\n            if leaf.deleted.is_some() {\n                log::trace!(\"retry due to deleted node in page_in\");\n                drop(write);\n\n                hint::spin_loop();\n\n                last_continue = concat!(file!(), ':', line!());\n\n                continue;\n            }\n\n            if &*leaf.lo > key {\n                let size = leaf.in_memory_size;\n                drop(write);\n                log::trace!(\"key undershoot in page_in\");\n                self.cache.mark_access_and_evict(\n                    node.object_id,\n                    size,\n                    flush_epoch,\n                )?;\n\n                hint::spin_loop();\n\n                last_continue = concat!(file!(), ':', line!());\n\n                continue;\n         
   }\n\n            if let Some(ref hi) = leaf.hi {\n                if &**hi <= key {\n                    let size = leaf.in_memory_size;\n                    log::trace!(\n                        \"key overshoot in page_in - search key {:?}, node hi {:?}\",\n                        key,\n                        hi\n                    );\n                    drop(write);\n                    self.cache.mark_access_and_evict(\n                        node.object_id,\n                        size,\n                        flush_epoch,\n                    )?;\n\n                    hint::spin_loop();\n\n                    last_continue = concat!(file!(), ':', line!());\n\n                    continue;\n                }\n            }\n            return Ok((low_key, write, node));\n        }\n    }\n\n    // NB: must never be called without having already added the empty leaf\n    // operations to a normal flush epoch. This function acquires the lock\n    // for the right sibling so that the right sibling's contents can be\n    // merged left into the empty leaf, but for this to happen atomically, the\n    // act of moving left must \"happen\" in the same flush epoch. By \"pushing\" the\n    // merge left potentially into a future flush epoch, any deletions that the\n    // leaf had applied that may have been a part of a previous batch would also\n    // be pushed into the future flush epoch, which would break the crash\n    // atomicity of the batch if the updates were not flushed in the same epoch\n    // as the rest of the batch. 
So, this is why we potentially separate the\n    // flush of the left merge from the flush of the operations that caused\n    // the leaf to empty in the first place.\n    fn merge_leaf_into_right_sibling<'a>(\n        &'a self,\n        mut predecessor: LeafWriteGuard<'a, LEAF_FANOUT>,\n    ) -> io::Result<()> {\n        #[cfg(feature = \"for-internal-testing-only\")]\n        let _b1 = track_blocks();\n\n        let mut successor = self.successor_leaf_mut(&predecessor)?;\n\n        // This should be true because we acquire the successor\n        // write mutex after acquiring the predecessor's.\n        assert!(successor.epoch() >= predecessor.epoch());\n\n        let merge_epoch = predecessor.epoch().max(successor.epoch());\n\n        let predecessor_epoch = predecessor.epoch();\n\n        let predecessor_leaf = predecessor.leaf_write.leaf.as_mut().unwrap();\n        let successor_leaf = successor.leaf_write.leaf.as_mut().unwrap();\n\n        assert!(predecessor_leaf.deleted.is_none());\n        assert!(predecessor_leaf.is_empty());\n        assert!(successor_leaf.deleted.is_none());\n        assert_eq!(\n            predecessor_leaf.hi.as_deref(),\n            Some(successor_leaf.lo.as_ref()),\n        );\n\n        log::trace!(\n            \"merging empty predecessor node id {} with low key {:?} and high key {:?} \\\n            and successor node id {} with low key {:?} and high key {:?} into the \\\n            predecessor\",\n            predecessor.node.object_id.0,\n            predecessor_leaf.lo,\n            predecessor_leaf.hi,\n            successor.node.object_id.0,\n            successor_leaf.lo,\n            successor_leaf.hi\n        );\n\n        if merge_epoch != predecessor_epoch {\n            // need to cooperatively serialize predecessor so that whatever\n            // writes caused it to be empty in the first place are atomically\n            // persisted with the rest of any batch that may have caused that.\n            
self.cooperatively_serialize_leaf(\n                predecessor.node.object_id,\n                &mut *predecessor_leaf,\n            );\n        }\n\n        predecessor_leaf.set_dirty_epoch(merge_epoch);\n        predecessor_leaf.merge_from(successor_leaf.as_mut());\n\n        successor_leaf.deleted = Some(merge_epoch);\n\n        successor\n            .inner\n            .cache\n            .tree_leaves_merged\n            .fetch_add(1, Ordering::Relaxed);\n\n        assert_eq!(successor.low_key, successor_leaf.lo);\n        assert_eq!(predecessor.low_key, predecessor_leaf.lo);\n\n        self.index.remove(&successor.low_key).unwrap();\n        self.cache.object_id_index.remove(&successor.node.object_id).unwrap();\n\n        // NB: these updates must \"happen\" atomically in the same flush epoch\n        self.cache.install_dirty(\n            merge_epoch,\n            successor.node.object_id,\n            Dirty::MergedAndDeleted {\n                object_id: successor.node.object_id,\n                collection_id: self.collection_id,\n            },\n        );\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        {\n            self.cache.event_verifier.mark(\n                successor.node.object_id,\n                merge_epoch,\n                event_verifier::State::Unallocated,\n                concat!(file!(), ':', line!(), \":merged\"),\n            );\n        }\n\n        self.cache.install_dirty(\n            merge_epoch,\n            predecessor.node.object_id,\n            Dirty::NotYetSerialized {\n                low_key: predecessor_leaf.lo.clone(),\n                node: predecessor.node.clone(),\n                collection_id: self.collection_id,\n            },\n        );\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        {\n            self.cache.event_verifier.mark(\n                predecessor.node.object_id,\n                merge_epoch,\n                event_verifier::State::Dirty,\n                
concat!(file!(), ':', line!(), \":merged-into\"),\n            );\n        }\n\n        let (p_object_id, p_sz) =\n            predecessor.handle_cache_access_and_eviction_externally();\n        let (s_object_id, s_sz) =\n            successor.handle_cache_access_and_eviction_externally();\n\n        self.cache.mark_access_and_evict(p_object_id, p_sz, merge_epoch)?;\n        self.cache.mark_access_and_evict(s_object_id, s_sz, merge_epoch)?;\n\n        Ok(())\n    }\n\n    fn successor_leaf_mut<'a>(\n        &'a self,\n        predecessor: &LeafWriteGuard<'a, LEAF_FANOUT>,\n    ) -> io::Result<LeafWriteGuard<'a, LEAF_FANOUT>> {\n        let predecessor_leaf = predecessor.leaf_write.leaf.as_ref().unwrap();\n        assert!(predecessor_leaf.hi.is_some());\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        let _b0 = track_blocks();\n\n        loop {\n            let search_key = predecessor_leaf.hi.as_ref().unwrap();\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            let _b1 = track_blocks();\n\n            let successor_node = self.leaf_for_key_mut(search_key)?;\n\n            let successor_leaf =\n                successor_node.leaf_write.leaf.as_ref().unwrap();\n\n            assert!(predecessor_leaf.lo < successor_leaf.lo);\n\n            if predecessor_leaf.hi.as_ref().unwrap() > &successor_leaf.lo {\n                let still_in_index = self.index.get(&successor_leaf.lo);\n                panic!(\n                    \"somehow, predecessor high key of {:?} \\\n                    is greater than successor low key of {:?}. 
current index presence: {:?} \\n \\\n                    predecessor: {:?} \\n successor: {:?}\",\n                    predecessor_leaf.hi,\n                    successor_leaf.lo,\n                    still_in_index,\n                    predecessor_leaf,\n                    successor_leaf,\n                );\n            }\n            if predecessor_leaf.hi.as_ref().unwrap() != &successor_leaf.lo {\n                continue;\n            }\n            return Ok(successor_node);\n        }\n    }\n\n    fn cooperatively_serialize_leaf(\n        &self,\n        object_id: ObjectId,\n        leaf: &mut Leaf<LEAF_FANOUT>,\n    ) {\n        // cooperatively serialize and put into dirty\n        let old_dirty_epoch = leaf.dirty_flush_epoch.take().unwrap();\n        assert!(Some(old_dirty_epoch) > leaf.max_unflushed_epoch);\n        leaf.max_unflushed_epoch = Some(old_dirty_epoch);\n        leaf.page_out_on_flush.take();\n\n        log::trace!(\n            \"cooperatively serializing leaf id {:?} with low key {:?}\",\n            object_id,\n            leaf.lo\n        );\n\n        #[cfg(feature = \"for-internal-testing-only\")]\n        {\n            self.cache.event_verifier.mark(\n                object_id,\n                old_dirty_epoch,\n                event_verifier::State::CooperativelySerialized,\n                concat!(file!(), ':', line!(), \":cooperative-serialize\"),\n            );\n        }\n\n        // be extra-explicit about serialized bytes\n        let leaf_ref: &Leaf<LEAF_FANOUT> = &*leaf;\n\n        let serialized =\n            leaf_ref.serialize(self.cache.config.zstd_compression_level);\n\n        log::trace!(\n            \"D adding node {} to dirty {:?}\",\n            object_id.0,\n            old_dirty_epoch\n        );\n\n        self.cache.install_dirty(\n            old_dirty_epoch,\n            object_id,\n            Dirty::CooperativelySerialized {\n                object_id,\n                collection_id: 
self.collection_id,\n                low_key: leaf.lo.clone(),\n                mutation_count: leaf.mutation_count,\n                data: Arc::new(serialized),\n            },\n        );\n    }\n\n    fn leaf_for_key<'a>(\n        &'a self,\n        key: &[u8],\n    ) -> io::Result<LeafReadGuard<'a, LEAF_FANOUT>> {\n        #[cfg(feature = \"for-internal-testing-only\")]\n        let _b0 = track_blocks();\n\n        loop {\n            let (low_key, node) = self.index.get_lte(key).unwrap();\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            let _b1 = track_blocks();\n\n            let mut read = node.inner.read_arc();\n\n            if read.leaf.is_none() {\n                drop(read);\n                let (read_low_key, write, _node) =\n                    self.page_in(key, self.cache.current_flush_epoch())?;\n                assert!(&*read_low_key <= key);\n                read = ArcRwLockWriteGuard::downgrade(write);\n            } else {\n                self.cache\n                    .read_stats\n                    .cache_hits\n                    .fetch_add(1, Ordering::Relaxed);\n            }\n\n            let leaf_guard = LeafReadGuard {\n                leaf_read: ManuallyDrop::new(read),\n                inner: self,\n                low_key,\n                object_id: node.object_id,\n                external_cache_access_and_eviction: false,\n            };\n\n            let leaf = leaf_guard.leaf_read.leaf.as_ref().unwrap();\n\n            if leaf.deleted.is_some() {\n                log::trace!(\"retry due to deleted node in leaf_for_key\");\n                drop(leaf_guard);\n                hint::spin_loop();\n                continue;\n            }\n            if &*leaf.lo > key {\n                log::trace!(\"key undershoot in leaf_for_key\");\n                drop(leaf_guard);\n                hint::spin_loop();\n\n                continue;\n            }\n            if let Some(ref hi) = leaf.hi {\n          
      if &**hi <= key {\n                    log::trace!(\"key overshoot on leaf_for_key\");\n                    // cache maintenance occurs in Drop for LeafReadGuard\n                    drop(leaf_guard);\n                    hint::spin_loop();\n                    continue;\n                }\n            }\n\n            if leaf.lo != node.low_key {\n                // TODO determine why this rare situation occurs and better\n                // understand whether it is really benign.\n                log::trace!(\"mismatch between object key and leaf low\");\n                // cache maintenance occurs in Drop for LeafReadGuard\n                drop(leaf_guard);\n                hint::spin_loop();\n                continue;\n            }\n\n            assert_eq!(node.low_key, leaf.lo);\n\n            return Ok(leaf_guard);\n        }\n    }\n\n    fn leaf_for_key_mut<'a>(\n        &'a self,\n        key: &[u8],\n    ) -> io::Result<LeafWriteGuard<'a, LEAF_FANOUT>> {\n        let reader_epoch = self.cache.current_flush_epoch();\n\n        let (low_key, mut write, node) = self.page_in(key, reader_epoch)?;\n\n        // by checking into an epoch after acquiring the node mutex, we\n        // avoid inversions where progress may be observed to go backwards.\n        let flush_epoch_guard = self.cache.check_into_flush_epoch();\n\n        let leaf = write.leaf.as_mut().unwrap();\n\n        // NB: these invariants should be enforced in page_in\n        assert!(leaf.deleted.is_none());\n        assert!(&*leaf.lo <= key);\n        if let Some(ref hi) = leaf.hi {\n            assert!(\n                &**hi > key,\n                \"while retrieving the leaf for key {:?} \\\n                we pulled a leaf with hi key of {:?}\",\n                key,\n                hi\n            );\n        }\n\n        if let Some(max_unflushed_epoch) = leaf.max_unflushed_epoch {\n            // We already serialized something for this epoch, so if we did so again,\n            // 
we need to think a bit.\n            assert_ne!(max_unflushed_epoch, flush_epoch_guard.epoch());\n        }\n\n        if let Some(old_dirty_epoch) = leaf.dirty_flush_epoch {\n            if old_dirty_epoch != flush_epoch_guard.epoch() {\n                assert!(old_dirty_epoch < flush_epoch_guard.epoch());\n\n                log::trace!(\n                    \"cooperatively flushing {:?} with dirty {:?} after checking into {:?}\",\n                    node.object_id,\n                    old_dirty_epoch,\n                    flush_epoch_guard.epoch()\n                );\n\n                self.cooperatively_serialize_leaf(node.object_id, &mut *leaf);\n            }\n        }\n\n        Ok(LeafWriteGuard {\n            flush_epoch_guard,\n            leaf_write: ManuallyDrop::new(write),\n            inner: self,\n            low_key,\n            node,\n            external_cache_access_and_eviction: false,\n        })\n    }\n\n    /// Retrieve a value from the `Tree` if it exists.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// db.insert(&[0], vec![0])?;\n    /// assert_eq!(db.get(&[0]).unwrap(), Some(sled::InlineArray::from(vec![0])));\n    /// assert!(db.get(&[1]).unwrap().is_none());\n    /// # Ok(()) }\n    /// ```\n    pub fn get<K: AsRef<[u8]>>(\n        &self,\n        key: K,\n    ) -> io::Result<Option<InlineArray>> {\n        self.check_error()?;\n\n        let key_ref = key.as_ref();\n\n        let leaf_guard = self.leaf_for_key(key_ref)?;\n\n        let leaf = leaf_guard.leaf_read.leaf.as_ref().unwrap();\n\n        if let Some(ref hi) = leaf.hi {\n            assert!(&**hi > key_ref);\n        }\n\n        Ok(leaf.get(key_ref).cloned())\n    }\n\n    /// Insert a key to a new value, returning the last value if it\n    /// was set.\n    ///\n    /// # Examples\n    ///\n 
   /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// assert_eq!(db.insert(&[1, 2, 3], vec![0]).unwrap(), None);\n    /// assert_eq!(db.insert(&[1, 2, 3], vec![1]).unwrap(), Some(sled::InlineArray::from(&[0])));\n    /// # Ok(()) }\n    /// ```\n    #[doc(alias = \"set\")]\n    #[doc(alias = \"put\")]\n    pub fn insert<K, V>(\n        &self,\n        key: K,\n        value: V,\n    ) -> io::Result<Option<InlineArray>>\n    where\n        K: AsRef<[u8]>,\n        V: Into<InlineArray>,\n    {\n        self.check_error()?;\n\n        let key_ref = key.as_ref();\n\n        let value_ivec = value.into();\n        let mut leaf_guard = self.leaf_for_key_mut(key_ref)?;\n        let new_epoch = leaf_guard.epoch();\n\n        let leaf = leaf_guard.leaf_write.leaf.as_mut().unwrap();\n\n        let ret = leaf.insert(key_ref.into(), value_ivec.clone());\n\n        let old_size =\n            ret.as_ref().map(|v| key_ref.len() + v.len()).unwrap_or(0);\n        let new_size = key_ref.len() + value_ivec.len();\n\n        if new_size > old_size {\n            leaf.in_memory_size += new_size - old_size;\n        } else {\n            leaf.in_memory_size =\n                leaf.in_memory_size.saturating_sub(old_size - new_size);\n        }\n\n        let split =\n            leaf.split_if_full(new_epoch, &self.cache, self.collection_id);\n        if split.is_some() || Some(value_ivec) != ret {\n            leaf.mutation_count += 1;\n            leaf.set_dirty_epoch(new_epoch);\n            log::trace!(\n                \"F adding node {} to dirty {:?}\",\n                leaf_guard.node.object_id.0,\n                new_epoch\n            );\n\n            self.cache.install_dirty(\n                new_epoch,\n                leaf_guard.node.object_id,\n                Dirty::NotYetSerialized {\n                    collection_id: 
self.collection_id,\n                    node: leaf_guard.node.clone(),\n                    low_key: leaf_guard.low_key.clone(),\n                },\n            );\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            {\n                self.cache.event_verifier.mark(\n                    leaf_guard.node.object_id,\n                    new_epoch,\n                    event_verifier::State::Dirty,\n                    concat!(file!(), ':', line!(), \":insert\"),\n                );\n            }\n        }\n        if let Some((split_key, rhs_node)) = split {\n            assert_eq!(leaf.hi.as_ref().unwrap(), &split_key);\n            log::trace!(\n                \"G adding new from split {:?} to dirty {:?}\",\n                rhs_node.object_id,\n                new_epoch\n            );\n\n            assert_ne!(rhs_node.object_id, leaf_guard.node.object_id);\n            assert!(!split_key.is_empty());\n\n            let rhs_object_id = rhs_node.object_id;\n\n            self.cache.install_dirty(\n                new_epoch,\n                rhs_object_id,\n                Dirty::NotYetSerialized {\n                    collection_id: self.collection_id,\n                    node: rhs_node.clone(),\n                    low_key: split_key.clone(),\n                },\n            );\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            {\n                self.cache.event_verifier.mark(\n                    rhs_object_id,\n                    new_epoch,\n                    event_verifier::State::Dirty,\n                    concat!(file!(), ':', line!(), \":insert-split\"),\n                );\n            }\n\n            // NB only make the new node reachable via the index after\n            // we marked it as dirty, as from this point on, any other\n            // thread may cooperatively deserialize it and maybe conflict\n            // with that previous NotYetSerialized marker.\n            self.cache\n            
    .object_id_index\n                .insert(rhs_node.object_id, rhs_node.clone());\n            let prev = self.index.insert(split_key, rhs_node);\n            assert!(prev.is_none());\n        }\n\n        // this is for clarity, that leaf_guard is held while\n        // inserting into dirty with its guarded epoch\n        drop(leaf_guard);\n\n        Ok(ret)\n    }\n\n    /// Delete a value, returning the old value if it existed.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// db.insert(&[1], vec![1]);\n    /// assert_eq!(db.remove(&[1]).unwrap(), Some(sled::InlineArray::from(vec![1])));\n    /// assert!(db.remove(&[1]).unwrap().is_none());\n    /// # Ok(()) }\n    /// ```\n    #[doc(alias = \"delete\")]\n    #[doc(alias = \"del\")]\n    pub fn remove<K: AsRef<[u8]>>(\n        &self,\n        key: K,\n    ) -> io::Result<Option<InlineArray>> {\n        self.check_error()?;\n\n        let key_ref = key.as_ref();\n\n        let mut leaf_guard = self.leaf_for_key_mut(key_ref)?;\n\n        let new_epoch = leaf_guard.epoch();\n\n        let leaf = leaf_guard.leaf_write.leaf.as_mut().unwrap();\n\n        assert!(leaf.deleted.is_none());\n\n        let ret = leaf.remove(key_ref);\n\n        if ret.is_some() {\n            leaf.mutation_count += 1;\n\n            leaf.set_dirty_epoch(new_epoch);\n\n            log::trace!(\n                \"H adding node {} to dirty {:?}\",\n                leaf_guard.node.object_id.0,\n                new_epoch\n            );\n\n            self.cache.install_dirty(\n                new_epoch,\n                leaf_guard.node.object_id,\n                Dirty::NotYetSerialized {\n                    collection_id: self.collection_id,\n                    low_key: leaf_guard.low_key.clone(),\n                    node: leaf_guard.node.clone(),\n           
     },\n            );\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            {\n                self.cache.event_verifier.mark(\n                    leaf_guard.node.object_id,\n                    new_epoch,\n                    event_verifier::State::Dirty,\n                    concat!(file!(), ':', line!(), \":remove\"),\n                );\n            }\n\n            if cfg!(not(feature = \"monotonic-behavior\"))\n                && leaf.is_empty()\n                && leaf.hi.is_some()\n            {\n                self.merge_leaf_into_right_sibling(leaf_guard)?;\n            }\n        }\n\n        Ok(ret)\n    }\n    /// Compare and swap. Capable of unique creation, conditional modification,\n    /// or deletion. If old is `None`, this will only set the value if it\n    /// doesn't exist yet. If new is `None`, will delete the value if old is\n    /// correct. If both old and new are `Some`, will modify the value if\n    /// old is correct.\n    ///\n    /// It returns `Ok(Ok(CompareAndSwapSuccess { new_value, previous_value }))` if operation finishes successfully.\n    ///\n    /// If it fails it returns:\n    ///     - `Ok(Err(CompareAndSwapError{ current, proposed }))` if no IO\n    ///       error was encountered but the operation\n    ///       failed to specify the correct current value. `CompareAndSwapError` contains\n    ///       current and proposed values.\n    ///     - `Err(io::Error)` if there was a high-level IO problem that prevented\n    ///       the operation from logically progressing. 
This is usually fatal and\n    ///       will prevent future requests from functioning, and requires the\n    ///       administrator to fix the system issue before restarting.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// // unique creation\n    /// assert!(\n    ///     db.compare_and_swap(&[1], None as Option<&[u8]>, Some(&[10])).unwrap().is_ok(),\n    /// );\n    ///\n    /// // conditional modification\n    /// assert!(\n    ///     db.compare_and_swap(&[1], Some(&[10]), Some(&[20])).unwrap().is_ok(),\n    /// );\n    ///\n    /// // failed conditional modification -- the current value is returned in\n    /// // the error variant\n    /// let operation = db.compare_and_swap(&[1], Some(&[30]), Some(&[40]));\n    /// assert!(operation.is_ok()); // the operation succeeded\n    /// let modification = operation.unwrap();\n    /// assert!(modification.is_err());\n    /// let actual_value = modification.unwrap_err();\n    /// assert_eq!(actual_value.current.map(|ivec| ivec.to_vec()), Some(vec![20]));\n    ///\n    /// // conditional deletion\n    /// assert!(\n    ///     db.compare_and_swap(&[1], Some(&[20]), None as Option<&[u8]>).unwrap().is_ok(),\n    /// );\n    /// assert!(db.get(&[1]).unwrap().is_none());\n    /// # Ok(()) }\n    /// ```\n    #[doc(alias = \"cas\")]\n    #[doc(alias = \"tas\")]\n    #[doc(alias = \"test_and_swap\")]\n    #[doc(alias = \"compare_and_set\")]\n    pub fn compare_and_swap<K, OV, NV>(\n        &self,\n        key: K,\n        old: Option<OV>,\n        new: Option<NV>,\n    ) -> CompareAndSwapResult\n    where\n        K: AsRef<[u8]>,\n        OV: AsRef<[u8]>,\n        NV: Into<InlineArray>,\n    {\n        self.check_error()?;\n\n        let key_ref = key.as_ref();\n\n        let mut leaf_guard = self.leaf_for_key_mut(key_ref)?;\n        let 
new_epoch = leaf_guard.epoch();\n\n        let proposed: Option<InlineArray> = new.map(Into::into);\n\n        let leaf = leaf_guard.leaf_write.leaf.as_mut().unwrap();\n\n        let current = leaf.get(key_ref).cloned();\n\n        let previous_matches = match (old, &current) {\n            (None, None) => true,\n            (Some(conditional), Some(current))\n                if conditional.as_ref() == current.as_ref() =>\n            {\n                true\n            }\n            _ => false,\n        };\n\n        let ret = if previous_matches {\n            if let Some(ref new_value) = proposed {\n                leaf.insert(key_ref.into(), new_value.clone())\n            } else {\n                leaf.remove(key_ref)\n            };\n\n            Ok(CompareAndSwapSuccess {\n                new_value: proposed,\n                previous_value: current,\n            })\n        } else {\n            Err(CompareAndSwapError { current, proposed })\n        };\n\n        let split =\n            leaf.split_if_full(new_epoch, &self.cache, self.collection_id);\n        let split_happened = split.is_some();\n        if split_happened || ret.is_ok() {\n            leaf.mutation_count += 1;\n\n            leaf.set_dirty_epoch(new_epoch);\n            log::trace!(\n                \"A adding node {} to dirty {:?}\",\n                leaf_guard.node.object_id.0,\n                new_epoch\n            );\n\n            self.cache.install_dirty(\n                new_epoch,\n                leaf_guard.node.object_id,\n                Dirty::NotYetSerialized {\n                    collection_id: self.collection_id,\n                    node: leaf_guard.node.clone(),\n                    low_key: leaf_guard.low_key.clone(),\n                },\n            );\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            {\n                self.cache.event_verifier.mark(\n                    leaf_guard.node.object_id,\n                    new_epoch,\n         
           event_verifier::State::Dirty,\n                    concat!(file!(), ':', line!(), \":cas\"),\n                );\n            }\n        }\n        if let Some((split_key, rhs_node)) = split {\n            log::trace!(\n                \"B adding new from split {:?} to dirty {:?}\",\n                rhs_node.object_id,\n                new_epoch\n            );\n            self.cache.install_dirty(\n                new_epoch,\n                rhs_node.object_id,\n                Dirty::NotYetSerialized {\n                    collection_id: self.collection_id,\n                    node: rhs_node.clone(),\n                    low_key: split_key.clone(),\n                },\n            );\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            {\n                self.cache.event_verifier.mark(\n                    rhs_node.object_id,\n                    new_epoch,\n                    event_verifier::State::Dirty,\n                    concat!(file!(), ':', line!(), \"cas-split\"),\n                );\n            }\n\n            // NB only make the new node reachable via the index after\n            // we marked it as dirty, as from this point on, any other\n            // thread may cooperatively deserialize it and maybe conflict\n            // with that previous NotYetSerialized marker.\n            self.cache\n                .object_id_index\n                .insert(rhs_node.object_id, rhs_node.clone());\n            let prev = self.index.insert(split_key, rhs_node);\n            assert!(prev.is_none());\n        }\n\n        if cfg!(not(feature = \"monotonic-behavior\"))\n            && leaf.is_empty()\n            && leaf.hi.is_some()\n        {\n            assert!(!split_happened);\n            self.merge_leaf_into_right_sibling(leaf_guard)?;\n        }\n\n        Ok(ret)\n    }\n\n    /// Fetch the value, apply a function to it and return the result.\n    ///\n    /// # Note\n    ///\n    /// This may call the function 
multiple times if the value has been\n    /// changed from other threads in the meantime.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// use sled::{Config, InlineArray};\n    ///\n    /// let config = Config::tmp().unwrap();\n    /// let db: sled::Db<1024> = config.open()?;\n    ///\n    /// fn u64_to_ivec(number: u64) -> InlineArray {\n    ///     InlineArray::from(number.to_be_bytes().to_vec())\n    /// }\n    ///\n    /// let zero = u64_to_ivec(0);\n    /// let one = u64_to_ivec(1);\n    /// let two = u64_to_ivec(2);\n    /// let three = u64_to_ivec(3);\n    ///\n    /// fn increment(old: Option<&[u8]>) -> Option<Vec<u8>> {\n    ///     let number = match old {\n    ///         Some(bytes) => {\n    ///             let array: [u8; 8] = bytes.try_into().unwrap();\n    ///             let number = u64::from_be_bytes(array);\n    ///             number + 1\n    ///         }\n    ///         None => 0,\n    ///     };\n    ///\n    ///     Some(number.to_be_bytes().to_vec())\n    /// }\n    ///\n    /// assert_eq!(db.update_and_fetch(\"counter\", increment).unwrap(), Some(zero));\n    /// assert_eq!(db.update_and_fetch(\"counter\", increment).unwrap(), Some(one));\n    /// assert_eq!(db.update_and_fetch(\"counter\", increment).unwrap(), Some(two));\n    /// assert_eq!(db.update_and_fetch(\"counter\", increment).unwrap(), Some(three));\n    /// # Ok(()) }\n    /// ```\n    pub fn update_and_fetch<K, V, F>(\n        &self,\n        key: K,\n        mut f: F,\n    ) -> io::Result<Option<InlineArray>>\n    where\n        K: AsRef<[u8]>,\n        F: FnMut(Option<&[u8]>) -> Option<V>,\n        V: Into<InlineArray>,\n    {\n        let key_ref = key.as_ref();\n        let mut current = self.get(key_ref)?;\n\n        loop {\n            let tmp = current.as_ref().map(AsRef::as_ref);\n            let next = f(tmp).map(Into::into);\n            match self.compare_and_swap::<_, _, InlineArray>(\n    
            key_ref,\n                tmp,\n                next.clone(),\n            )? {\n                Ok(_) => return Ok(next),\n                Err(CompareAndSwapError { current: cur, .. }) => {\n                    current = cur;\n                }\n            }\n        }\n    }\n\n    /// Fetch the value, apply a function to it and return the previous value.\n    ///\n    /// # Note\n    ///\n    /// This may call the function multiple times if the value has been\n    /// changed from other threads in the meantime.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// use sled::{Config, InlineArray};\n    ///\n    /// let config = Config::tmp().unwrap();\n    /// let db: sled::Db<1024> = config.open()?;\n    ///\n    /// fn u64_to_ivec(number: u64) -> InlineArray {\n    ///     InlineArray::from(number.to_be_bytes().to_vec())\n    /// }\n    ///\n    /// let zero = u64_to_ivec(0);\n    /// let one = u64_to_ivec(1);\n    /// let two = u64_to_ivec(2);\n    ///\n    /// fn increment(old: Option<&[u8]>) -> Option<Vec<u8>> {\n    ///     let number = match old {\n    ///         Some(bytes) => {\n    ///             let array: [u8; 8] = bytes.try_into().unwrap();\n    ///             let number = u64::from_be_bytes(array);\n    ///             number + 1\n    ///         }\n    ///         None => 0,\n    ///     };\n    ///\n    ///     Some(number.to_be_bytes().to_vec())\n    /// }\n    ///\n    /// assert_eq!(db.fetch_and_update(\"counter\", increment).unwrap(), None);\n    /// assert_eq!(db.fetch_and_update(\"counter\", increment).unwrap(), Some(zero));\n    /// assert_eq!(db.fetch_and_update(\"counter\", increment).unwrap(), Some(one));\n    /// assert_eq!(db.fetch_and_update(\"counter\", increment).unwrap(), Some(two));\n    /// # Ok(()) }\n    /// ```\n    pub fn fetch_and_update<K, V, F>(\n        &self,\n        key: K,\n        mut f: F,\n    ) -> 
io::Result<Option<InlineArray>>\n    where\n        K: AsRef<[u8]>,\n        F: FnMut(Option<&[u8]>) -> Option<V>,\n        V: Into<InlineArray>,\n    {\n        let key_ref = key.as_ref();\n        let mut current = self.get(key_ref)?;\n\n        loop {\n            let tmp = current.as_ref().map(AsRef::as_ref);\n            let next = f(tmp);\n            match self.compare_and_swap(key_ref, tmp, next)? {\n                Ok(_) => return Ok(current),\n                Err(CompareAndSwapError { current: cur, .. }) => {\n                    current = cur;\n                }\n            }\n        }\n    }\n\n    pub fn iter(&self) -> Iter<LEAF_FANOUT> {\n        Iter {\n            prefetched: VecDeque::new(),\n            prefetched_back: VecDeque::new(),\n            next_fetch: Some(InlineArray::MIN),\n            next_back_last_lo: None,\n            next_calls: 0,\n            next_back_calls: 0,\n            inner: self.clone(),\n            bounds: (Bound::Unbounded, Bound::Unbounded),\n        }\n    }\n\n    pub fn range<K, R>(&self, range: R) -> Iter<LEAF_FANOUT>\n    where\n        K: AsRef<[u8]>,\n        R: RangeBounds<K>,\n    {\n        let start: Bound<InlineArray> =\n            map_bound(range.start_bound(), |b| InlineArray::from(b.as_ref()));\n        let end: Bound<InlineArray> =\n            map_bound(range.end_bound(), |b| InlineArray::from(b.as_ref()));\n\n        let next_fetch = Some(match &start {\n            Bound::Included(b) | Bound::Excluded(b) => b.clone(),\n            Bound::Unbounded => InlineArray::MIN,\n        });\n\n        Iter {\n            prefetched: VecDeque::new(),\n            prefetched_back: VecDeque::new(),\n            next_fetch,\n            next_back_last_lo: None,\n            next_calls: 0,\n            next_back_calls: 0,\n            inner: self.clone(),\n            bounds: (start, end),\n        }\n    }\n\n    /// Create a new batched update that is applied\n    /// atomically. 
Readers will atomically see all updates\n    /// at an atomic instant, and if the database crashes,\n    /// either 0% or 100% of the full batch will be recovered,\n    /// but never a partial batch. If a `flush` operation succeeds\n    /// after this, it is guaranteed that 100% of the batch will be\n    /// visible, unless later concurrent updates changed the values\n    /// before the flush.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let _ = std::fs::remove_dir_all(\"batch_doctest\");\n    /// # let db: sled::Db<1024> = sled::open(\"batch_doctest\")?;\n    /// db.insert(\"key_0\", \"val_0\")?;\n    ///\n    /// let mut batch = sled::Batch::default();\n    /// batch.insert(\"key_a\", \"val_a\");\n    /// batch.insert(\"key_b\", \"val_b\");\n    /// batch.insert(\"key_c\", \"val_c\");\n    /// batch.remove(\"key_0\");\n    ///\n    /// db.apply_batch(batch)?;\n    /// // key_0 no longer exists, and key_a, key_b, and key_c\n    /// // now do exist.\n    /// # let _ = std::fs::remove_dir_all(\"batch_doctest\");\n    /// # Ok(()) }\n    /// ```\n    pub fn apply_batch(&self, batch: Batch) -> io::Result<()> {\n        // NB: we rely on lexicographic lock acquisition\n        // by iterating over the batch's BTreeMap to avoid\n        // deadlocks during 2PL\n        let mut acquired_locks: BTreeMap<\n            InlineArray,\n            (\n                ArcRwLockWriteGuard<RawRwLock, CacheBox<LEAF_FANOUT>>,\n                Object<LEAF_FANOUT>,\n            ),\n        > = BTreeMap::new();\n\n        // Phase 1: lock acquisition\n        let mut last: Option<(\n            InlineArray,\n            ArcRwLockWriteGuard<RawRwLock, CacheBox<LEAF_FANOUT>>,\n            Object<LEAF_FANOUT>,\n        )> = None;\n\n        for key in batch.writes.keys() {\n            if let Some((_lo, w, _id)) = &last {\n                let leaf = w.leaf.as_ref().unwrap();\n                assert!(&leaf.lo 
<= key);\n                if let Some(hi) = &leaf.hi {\n                    if hi <= key {\n                        let (lo, w, n) = last.take().unwrap();\n                        acquired_locks.insert(lo, (w, n));\n                    }\n                }\n            }\n            if last.is_none() {\n                // TODO evaluate whether this is correct, as page_in performs\n                // cache maintenance internally if it over/undershoots due to\n                // concurrent modifications.\n                last =\n                    Some(self.page_in(key, self.cache.current_flush_epoch())?);\n            }\n        }\n\n        if let Some((lo, w, id)) = last.take() {\n            acquired_locks.insert(lo, (w, id));\n        }\n\n        // NB: add the flush epoch at the end of the lock acquisition\n        // process when all locks have been acquired, to avoid situations\n        // where a leaf is already dirty with an epoch \"from the future\".\n        let flush_epoch_guard = self.cache.check_into_flush_epoch();\n        let new_epoch = flush_epoch_guard.epoch();\n\n        // Flush any leaves that are dirty from a previous flush epoch\n        // before performing operations.\n        for (write, node) in acquired_locks.values_mut() {\n            let leaf = write.leaf.as_mut().unwrap();\n            if let Some(old_flush_epoch) = leaf.dirty_flush_epoch {\n                if old_flush_epoch == new_epoch {\n                    // no need to cooperatively flush\n                    continue;\n                }\n\n                assert!(old_flush_epoch < new_epoch);\n\n                log::trace!(\n                    \"cooperatively flushing {:?} with dirty {:?} after checking into {:?}\",\n                    node.object_id,\n                    old_flush_epoch,\n                    new_epoch\n                );\n\n                self.cooperatively_serialize_leaf(node.object_id, &mut *leaf);\n            }\n        }\n\n        let mut splits: 
Vec<(InlineArray, Object<LEAF_FANOUT>)> = vec![];\n        let mut merges: BTreeMap<InlineArray, Object<LEAF_FANOUT>> =\n            BTreeMap::new();\n\n        // Insert and split when full\n        for (key, value_opt) in batch.writes {\n            let range = ..=&key;\n            let (lo, (w, object)) = acquired_locks\n                .range_mut::<InlineArray, _>(range)\n                .next_back()\n                .unwrap();\n            let leaf = w.leaf.as_mut().unwrap();\n\n            assert_eq!(lo, &leaf.lo);\n            assert!(leaf.lo <= key);\n            if let Some(hi) = &leaf.hi {\n                assert!(hi > &key);\n            }\n\n            if let Some(value) = value_opt {\n                leaf.insert(key, value);\n                merges.remove(lo);\n\n                merges.remove(&leaf.lo);\n\n                if let Some((split_key, rhs_node)) = leaf.split_if_full(\n                    new_epoch,\n                    &self.cache,\n                    self.collection_id,\n                ) {\n                    #[cfg(feature = \"for-internal-testing-only\")]\n                    let _b1 = track_blocks();\n\n                    let write = rhs_node.inner.write_arc();\n                    assert!(write.leaf.is_some());\n\n                    splits.push((split_key.clone(), rhs_node.clone()));\n                    acquired_locks.insert(split_key, (write, rhs_node));\n                }\n            } else {\n                leaf.remove(&key);\n\n                if leaf.is_empty() {\n                    assert_eq!(leaf.lo, lo);\n                    merges.insert(leaf.lo.clone(), object.clone());\n                }\n            }\n        }\n\n        // Make splits globally visible\n        for (split_key, rhs_node) in splits {\n            self.cache\n                .object_id_index\n                .insert(rhs_node.object_id, rhs_node.clone());\n            self.index.insert(split_key, rhs_node);\n        }\n\n        // Add all written 
leaves to dirty and prepare to mark cache accesses\n        let mut cache_accesses = Vec::with_capacity(acquired_locks.len());\n        for (low_key, (write, node)) in &mut acquired_locks {\n            let leaf = write.leaf.as_mut().unwrap();\n            leaf.set_dirty_epoch(new_epoch);\n            leaf.mutation_count += 1;\n            cache_accesses.push((node.object_id, leaf.in_memory_size));\n            self.cache.install_dirty(\n                new_epoch,\n                node.object_id,\n                Dirty::NotYetSerialized {\n                    collection_id: self.collection_id,\n                    node: node.clone(),\n                    low_key: low_key.clone(),\n                },\n            );\n\n            #[cfg(feature = \"for-internal-testing-only\")]\n            {\n                self.cache.event_verifier.mark(\n                    node.object_id,\n                    new_epoch,\n                    event_verifier::State::Dirty,\n                    concat!(file!(), ':', line!(), \":apply-batch\"),\n                );\n            }\n        }\n\n        // Drop locks\n        drop(acquired_locks);\n\n        // Perform cache maintenance\n        for (object_id, size) in cache_accesses {\n            self.cache.mark_access_and_evict(object_id, size, new_epoch)?;\n        }\n\n        Ok(())\n    }\n\n    /// Returns `true` if the `Tree` contains a value for\n    /// the specified key.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// db.insert(&[0], vec![0])?;\n    /// assert!(db.contains_key(&[0])?);\n    /// assert!(!db.contains_key(&[1])?);\n    /// # Ok(()) }\n    /// ```\n    pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> io::Result<bool> {\n        self.get(key).map(|v| v.is_some())\n    }\n\n    /// Retrieve the key and value before the provided 
key,\n    /// if one exists.\n    ///\n    /// # Note\n    /// The order follows the Ord implementation for `Vec<u8>`:\n    ///\n    /// `[] < [0] < [255] < [255, 0] < [255, 255] ...`\n    ///\n    /// To retain the ordering of numerical types use big endian representation\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// use sled::InlineArray;\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// for i in 0..10 {\n    ///     db.insert(&[i], vec![i])\n    ///         .expect(\"should write successfully\");\n    /// }\n    ///\n    /// assert!(db.get_lt(&[]).unwrap().is_none());\n    /// assert!(db.get_lt(&[0]).unwrap().is_none());\n    /// assert_eq!(\n    ///     db.get_lt(&[1]).unwrap(),\n    ///     Some((InlineArray::from(&[0]), InlineArray::from(&[0])))\n    /// );\n    /// assert_eq!(\n    ///     db.get_lt(&[9]).unwrap(),\n    ///     Some((InlineArray::from(&[8]), InlineArray::from(&[8])))\n    /// );\n    /// assert_eq!(\n    ///     db.get_lt(&[10]).unwrap(),\n    ///     Some((InlineArray::from(&[9]), InlineArray::from(&[9])))\n    /// );\n    /// assert_eq!(\n    ///     db.get_lt(&[255]).unwrap(),\n    ///     Some((InlineArray::from(&[9]), InlineArray::from(&[9])))\n    /// );\n    /// # Ok(()) }\n    /// ```\n    pub fn get_lt<K>(\n        &self,\n        key: K,\n    ) -> io::Result<Option<(InlineArray, InlineArray)>>\n    where\n        K: AsRef<[u8]>,\n    {\n        self.range(..key).next_back().transpose()\n    }\n\n    /// Retrieve the next key and value from the `Tree` after the\n    /// provided key.\n    ///\n    /// # Note\n    /// The order follows the Ord implementation for `Vec<u8>`:\n    ///\n    /// `[] < [0] < [255] < [255, 0] < [255, 255] ...`\n    ///\n    /// To retain the ordering of numerical types use big endian representation\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn 
main() -> Result<(), Box<dyn std::error::Error>> {\n    /// use sled::InlineArray;\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// for i in 0..10 {\n    ///     db.insert(&[i], vec![i])?;\n    /// }\n    ///\n    /// assert_eq!(\n    ///     db.get_gt(&[]).unwrap(),\n    ///     Some((InlineArray::from(&[0]), InlineArray::from(&[0])))\n    /// );\n    /// assert_eq!(\n    ///     db.get_gt(&[0]).unwrap(),\n    ///     Some((InlineArray::from(&[1]), InlineArray::from(&[1])))\n    /// );\n    /// assert_eq!(\n    ///     db.get_gt(&[1]).unwrap(),\n    ///     Some((InlineArray::from(&[2]), InlineArray::from(&[2])))\n    /// );\n    /// assert_eq!(\n    ///     db.get_gt(&[8]).unwrap(),\n    ///     Some((InlineArray::from(&[9]), InlineArray::from(&[9])))\n    /// );\n    /// assert!(db.get_gt(&[9]).unwrap().is_none());\n    ///\n    /// db.insert(500u16.to_be_bytes(), vec![10]);\n    /// assert_eq!(\n    ///     db.get_gt(&499u16.to_be_bytes()).unwrap(),\n    ///     Some((InlineArray::from(&500u16.to_be_bytes()), InlineArray::from(&[10])))\n    /// );\n    /// # Ok(()) }\n    /// ```\n    pub fn get_gt<K>(\n        &self,\n        key: K,\n    ) -> io::Result<Option<(InlineArray, InlineArray)>>\n    where\n        K: AsRef<[u8]>,\n    {\n        self.range((ops::Bound::Excluded(key), ops::Bound::Unbounded))\n            .next()\n            .transpose()\n    }\n\n    /// Create an iterator over tuples of keys and values\n    /// where all keys start with the given prefix.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// use sled::InlineArray;\n    /// db.insert(&[0, 0, 0], vec![0, 0, 0])?;\n    /// db.insert(&[0, 0, 1], vec![0, 0, 1])?;\n    /// db.insert(&[0, 0, 2], vec![0, 0, 2])?;\n    /// db.insert(&[0, 0, 3], vec![0, 
0, 3])?;\n    /// db.insert(&[0, 1, 0], vec![0, 1, 0])?;\n    /// db.insert(&[0, 1, 1], vec![0, 1, 1])?;\n    ///\n    /// let prefix: &[u8] = &[0, 0];\n    /// let mut r = db.scan_prefix(prefix);\n    /// assert_eq!(\n    ///     r.next().unwrap().unwrap(),\n    ///     (InlineArray::from(&[0, 0, 0]), InlineArray::from(&[0, 0, 0]))\n    /// );\n    /// assert_eq!(\n    ///     r.next().unwrap().unwrap(),\n    ///     (InlineArray::from(&[0, 0, 1]), InlineArray::from(&[0, 0, 1]))\n    /// );\n    /// assert_eq!(\n    ///     r.next().unwrap().unwrap(),\n    ///     (InlineArray::from(&[0, 0, 2]), InlineArray::from(&[0, 0, 2]))\n    /// );\n    /// assert_eq!(\n    ///     r.next().unwrap().unwrap(),\n    ///     (InlineArray::from(&[0, 0, 3]), InlineArray::from(&[0, 0, 3]))\n    /// );\n    /// assert!(r.next().is_none());\n    /// # Ok(()) }\n    /// ```\n    pub fn scan_prefix<P>(&self, prefix: P) -> Iter<LEAF_FANOUT>\n    where\n        P: AsRef<[u8]>,\n    {\n        let prefix_ref = prefix.as_ref();\n        let mut upper = prefix_ref.to_vec();\n\n        while let Some(last) = upper.pop() {\n            if last < u8::MAX {\n                upper.push(last + 1);\n                return self.range(prefix_ref..&upper);\n            }\n        }\n\n        self.range(prefix..)\n    }\n\n    /// Returns the first key and value in the `Tree`, or\n    /// `None` if the `Tree` is empty.\n    pub fn first(&self) -> io::Result<Option<(InlineArray, InlineArray)>> {\n        self.iter().next().transpose()\n    }\n\n    /// Returns the last key and value in the `Tree`, or\n    /// `None` if the `Tree` is empty.\n    pub fn last(&self) -> io::Result<Option<(InlineArray, InlineArray)>> {\n        self.iter().next_back().transpose()\n    }\n\n    /// Atomically removes the maximum item in the `Tree` instance.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = 
sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// db.insert(&[0], vec![0])?;\n    /// db.insert(&[1], vec![10])?;\n    /// db.insert(&[2], vec![20])?;\n    /// db.insert(&[3], vec![30])?;\n    /// db.insert(&[4], vec![40])?;\n    /// db.insert(&[5], vec![50])?;\n    ///\n    /// assert_eq!(&db.pop_last()?.unwrap().0, &[5]);\n    /// assert_eq!(&db.pop_last()?.unwrap().0, &[4]);\n    /// assert_eq!(&db.pop_last()?.unwrap().0, &[3]);\n    /// assert_eq!(&db.pop_last()?.unwrap().0, &[2]);\n    /// assert_eq!(&db.pop_last()?.unwrap().0, &[1]);\n    /// assert_eq!(&db.pop_last()?.unwrap().0, &[0]);\n    /// assert_eq!(db.pop_last()?, None);\n    /// # Ok(()) }\n    /// ```\n    pub fn pop_last(&self) -> io::Result<Option<(InlineArray, InlineArray)>> {\n        loop {\n            if let Some(first_res) = self.iter().next_back() {\n                let first = first_res?;\n                if self\n                    .compare_and_swap::<_, _, &[u8]>(\n                        &first.0,\n                        Some(&first.1),\n                        None,\n                    )?\n                    .is_ok()\n                {\n                    log::trace!(\"pop_last removed item {:?}\", first);\n                    return Ok(Some(first));\n                }\n            // try again\n            } else {\n                log::trace!(\"pop_last removed nothing from empty tree\");\n                return Ok(None);\n            }\n        }\n    }\n\n    /// Pops the last kv pair in the provided range, or returns `Ok(None)` if nothing\n    /// exists within that range.\n    ///\n    /// # Panics\n    ///\n    /// This will panic if the provided range's end_bound() == Bound::Excluded(K::MIN).\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    ///\n    /// let 
data = vec![\n    ///     (b\"key 1\", b\"value 1\"),\n    ///     (b\"key 2\", b\"value 2\"),\n    ///     (b\"key 3\", b\"value 3\")\n    /// ];\n    ///\n    /// for (k, v) in data {\n    ///     db.insert(k, v).unwrap();\n    /// }\n    ///\n    /// let r1 = db.pop_last_in_range(b\"key 1\".as_ref()..=b\"key 3\").unwrap();\n    /// assert_eq!(Some((b\"key 3\".into(), b\"value 3\".into())), r1);\n    ///\n    /// let r2 = db.pop_last_in_range(b\"key 1\".as_ref()..b\"key 3\").unwrap();\n    /// assert_eq!(Some((b\"key 2\".into(), b\"value 2\".into())), r2);\n    ///\n    /// let r3 = db.pop_last_in_range(b\"key 4\".as_ref()..).unwrap();\n    /// assert!(r3.is_none());\n    ///\n    /// let r4 = db.pop_last_in_range(b\"key 2\".as_ref()..=b\"key 3\").unwrap();\n    /// assert!(r4.is_none());\n    ///\n    /// let r5 = db.pop_last_in_range(b\"key 0\".as_ref()..=b\"key 3\").unwrap();\n    /// assert_eq!(Some((b\"key 1\".into(), b\"value 1\".into())), r5);\n    ///\n    /// let r6 = db.pop_last_in_range(b\"key 0\".as_ref()..=b\"key 3\").unwrap();\n    /// assert!(r6.is_none());\n    /// # Ok (()) }\n    /// ```\n    pub fn pop_last_in_range<K, R>(\n        &self,\n        range: R,\n    ) -> io::Result<Option<(InlineArray, InlineArray)>>\n    where\n        K: AsRef<[u8]>,\n        R: Clone + RangeBounds<K>,\n    {\n        loop {\n            let mut r = self.range(range.clone());\n            let (k, v) = if let Some(kv_res) = r.next_back() {\n                kv_res?\n            } else {\n                return Ok(None);\n            };\n            if self\n                .compare_and_swap(&k, Some(&v), None as Option<InlineArray>)?\n                .is_ok()\n            {\n                return Ok(Some((k, v)));\n            }\n        }\n    }\n\n    /// Atomically removes the minimum item in the `Tree` instance.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = 
sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// db.insert(&[0], vec![0])?;\n    /// db.insert(&[1], vec![10])?;\n    /// db.insert(&[2], vec![20])?;\n    /// db.insert(&[3], vec![30])?;\n    /// db.insert(&[4], vec![40])?;\n    /// db.insert(&[5], vec![50])?;\n    ///\n    /// assert_eq!(&db.pop_first()?.unwrap().0, &[0]);\n    /// assert_eq!(&db.pop_first()?.unwrap().0, &[1]);\n    /// assert_eq!(&db.pop_first()?.unwrap().0, &[2]);\n    /// assert_eq!(&db.pop_first()?.unwrap().0, &[3]);\n    /// assert_eq!(&db.pop_first()?.unwrap().0, &[4]);\n    /// assert_eq!(&db.pop_first()?.unwrap().0, &[5]);\n    /// assert_eq!(db.pop_first()?, None);\n    /// # Ok(()) }\n    /// ```\n    pub fn pop_first(&self) -> io::Result<Option<(InlineArray, InlineArray)>> {\n        loop {\n            if let Some(first_res) = self.iter().next() {\n                let first = first_res?;\n                if self\n                    .compare_and_swap::<_, _, &[u8]>(\n                        &first.0,\n                        Some(&first.1),\n                        None,\n                    )?\n                    .is_ok()\n                {\n                    log::trace!(\"pop_first removed item {:?}\", first);\n                    return Ok(Some(first));\n                }\n            // try again\n            } else {\n                log::trace!(\"pop_first removed nothing from empty tree\");\n                return Ok(None);\n            }\n        }\n    }\n\n    /// Pops the first kv pair in the provided range, or returns `Ok(None)` if nothing\n    /// exists within that range.\n    ///\n    /// # Panics\n    ///\n    /// This will panic if the provided range's end_bound() == Bound::Excluded(K::MIN).\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    ///\n    
/// let data = vec![\n    ///     (b\"key 1\", b\"value 1\"),\n    ///     (b\"key 2\", b\"value 2\"),\n    ///     (b\"key 3\", b\"value 3\")\n    /// ];\n    ///\n    /// for (k, v) in data {\n    ///     db.insert(k, v).unwrap();\n    /// }\n    ///\n    /// let r1 = db.pop_first_in_range(\"key 1\".as_ref()..=\"key 3\").unwrap();\n    /// assert_eq!(Some((b\"key 1\".into(), b\"value 1\".into())), r1);\n    ///\n    /// let r2 = db.pop_first_in_range(\"key 1\".as_ref()..\"key 3\").unwrap();\n    /// assert_eq!(Some((b\"key 2\".into(), b\"value 2\".into())), r2);\n    ///\n    /// let r3_res: std::io::Result<Vec<_>> = db.range(b\"key 4\".as_ref()..).collect();\n    /// let r3: Vec<_> = r3_res.unwrap();\n    /// assert!(r3.is_empty());\n    ///\n    /// let r4 = db.pop_first_in_range(\"key 2\".as_ref()..=\"key 3\").unwrap();\n    /// assert_eq!(Some((b\"key 3\".into(), b\"value 3\".into())), r4);\n    /// # Ok (()) }\n    /// ```\n    pub fn pop_first_in_range<K, R>(\n        &self,\n        range: R,\n    ) -> io::Result<Option<(InlineArray, InlineArray)>>\n    where\n        K: AsRef<[u8]>,\n        R: Clone + RangeBounds<K>,\n    {\n        loop {\n            let mut r = self.range(range.clone());\n            let (k, v) = if let Some(kv_res) = r.next() {\n                kv_res?\n            } else {\n                return Ok(None);\n            };\n            if self\n                .compare_and_swap(&k, Some(&v), None as Option<InlineArray>)?\n                .is_ok()\n            {\n                return Ok(Some((k, v)));\n            }\n        }\n    }\n\n    /// Returns the number of elements in this tree.\n    ///\n    /// Beware: performs a full O(n) scan under the hood.\n    ///\n    /// # Examples\n    ///\n    /// ```\n    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n    /// # let config = sled::Config::tmp().unwrap();\n    /// # let db: sled::Db<1024> = config.open()?;\n    /// db.insert(b\"a\", vec![0]);\n    /// 
db.insert(b\"b\", vec![1]);\n    /// assert_eq!(db.len().unwrap(), 2);\n    /// # Ok(()) }\n    /// ```\n    pub fn len(&self) -> io::Result<usize> {\n        let mut count = 0;\n        for item_res in self.iter() {\n            let _item = item_res?;\n            count += 1;\n        }\n        Ok(count)\n    }\n\n    /// Returns `true` if the `Tree` contains no elements.\n    ///\n    /// This is O(1), as we only need to see if an iterator\n    /// returns anything for the first call to `next()`.\n    pub fn is_empty(&self) -> io::Result<bool> {\n        if let Some(res) = self.iter().next() {\n            res?;\n            Ok(false)\n        } else {\n            Ok(true)\n        }\n    }\n\n    /// Clears the `Tree`, removing all values.\n    ///\n    /// Note that this is not atomic.\n    ///\n    /// Beware: performs a full O(n) scan under the hood.\n    pub fn clear(&self) -> io::Result<()> {\n        for k in self.iter().keys() {\n            let key = k?;\n            let _old = self.remove(key)?;\n        }\n        Ok(())\n    }\n\n    /// Returns the CRC32 of all keys and values\n    /// in this Tree.\n    ///\n    /// This is O(N) and locks the underlying tree\n    /// for the duration of the entire scan.\n    pub fn checksum(&self) -> io::Result<u32> {\n        let mut hasher = crc32fast::Hasher::new();\n        for kv_res in self.iter() {\n            let (k, v) = kv_res?;\n            hasher.update(&k);\n            hasher.update(&v);\n        }\n        Ok(hasher.finalize())\n    }\n}\n\n#[allow(unused)]\npub struct Iter<const LEAF_FANOUT: usize> {\n    inner: Tree<LEAF_FANOUT>,\n    bounds: (Bound<InlineArray>, Bound<InlineArray>),\n    next_calls: usize,\n    next_back_calls: usize,\n    next_fetch: Option<InlineArray>,\n    next_back_last_lo: Option<InlineArray>,\n    prefetched: VecDeque<(InlineArray, InlineArray)>,\n    prefetched_back: VecDeque<(InlineArray, InlineArray)>,\n}\n\nimpl<const LEAF_FANOUT: usize> Iterator for Iter<LEAF_FANOUT> 
{\n    type Item = io::Result<(InlineArray, InlineArray)>;\n\n    fn next(&mut self) -> Option<Self::Item> {\n        self.next_calls += 1;\n        while self.prefetched.is_empty() {\n            let search_key = if let Some(last) = &self.next_fetch {\n                last.clone()\n            } else {\n                return None;\n            };\n\n            let node = match self.inner.leaf_for_key(&search_key) {\n                Ok(n) => n,\n                Err(e) => return Some(Err(e)),\n            };\n\n            let leaf = node.leaf_read.leaf.as_ref().unwrap();\n\n            if let Some(leaf_hi) = &leaf.hi {\n                if leaf_hi <= &search_key {\n                    // concurrent merge, retry\n                    log::trace!(\"undershot in interator, retrying search\");\n                    continue;\n                }\n            }\n\n            if leaf.lo > search_key {\n                // concurrent successor split, retry\n                log::trace!(\"overshot in interator, retrying search\");\n                continue;\n            }\n\n            for (k, v) in leaf.iter() {\n                if self.bounds.contains(&k) && search_key <= k {\n                    self.prefetched.push_back((k.clone(), v.clone()));\n                }\n            }\n\n            self.next_fetch = leaf.hi.clone();\n        }\n\n        self.prefetched.pop_front().map(Ok)\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> DoubleEndedIterator for Iter<LEAF_FANOUT> {\n    fn next_back(&mut self) -> Option<Self::Item> {\n        self.next_back_calls += 1;\n        while self.prefetched_back.is_empty() {\n            let search_key: InlineArray = if let Some(last) =\n                &self.next_back_last_lo\n            {\n                if !self.bounds.contains(last) || last == &InlineArray::MIN {\n                    return None;\n                }\n                self.inner\n                    .index\n                    .range::<InlineArray, _>(..last)\n            
        .next_back()\n                    .unwrap()\n                    .0\n            } else {\n                match &self.bounds.1 {\n                    Bound::Included(k) => k.clone(),\n                    Bound::Excluded(k) if k == &InlineArray::MIN => {\n                        InlineArray::MIN\n                    }\n                    Bound::Excluded(k) => self.inner.index.get_lt(k).unwrap().0,\n                    Bound::Unbounded => self.inner.index.last().unwrap().0,\n                }\n            };\n\n            let node = match self.inner.leaf_for_key(&search_key) {\n                Ok(n) => n,\n                Err(e) => return Some(Err(e)),\n            };\n\n            let leaf = node.leaf_read.leaf.as_ref().unwrap();\n\n            if leaf.lo > search_key {\n                // concurrent successor split, retry\n                log::trace!(\"overshot in reverse interator, retrying search\");\n                continue;\n            }\n\n            // determine if we undershot our target due to concurrent modifications\n            let undershot =\n                match (&leaf.hi, &self.next_back_last_lo, &self.bounds.1) {\n                    (Some(leaf_hi), Some(last_lo), _) => leaf_hi < last_lo,\n                    (Some(_leaf_hi), None, Bound::Unbounded) => true,\n                    (Some(leaf_hi), None, Bound::Included(bound_key)) => {\n                        leaf_hi <= bound_key\n                    }\n                    (Some(leaf_hi), None, Bound::Excluded(bound_key)) => {\n                        leaf_hi < bound_key\n                    }\n                    (None, _, _) => false,\n                };\n\n            if undershot {\n                log::trace!(\n                    \"undershoot detected in reverse iterator with \\\n                    (leaf_hi, next_back_last_lo, self.bounds.1) being {:?}\",\n                    (&leaf.hi, &self.next_back_last_lo, &self.bounds.1)\n                );\n                continue;\n     
       }\n\n            for (k, v) in leaf.iter() {\n                if self.bounds.contains(&k) {\n                    let beneath_last_lo =\n                        if let Some(last_lo) = &self.next_back_last_lo {\n                            &k < last_lo\n                        } else {\n                            true\n                        };\n                    if beneath_last_lo {\n                        self.prefetched_back.push_back((k.clone(), v.clone()));\n                    }\n                }\n            }\n            self.next_back_last_lo = Some(leaf.lo.clone());\n        }\n\n        self.prefetched_back.pop_back().map(Ok)\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> Iter<LEAF_FANOUT> {\n    pub fn keys(\n        self,\n    ) -> impl DoubleEndedIterator<Item = io::Result<InlineArray>> {\n        self.into_iter().map(|kv_res| kv_res.map(|(k, _v)| k))\n    }\n\n    pub fn values(\n        self,\n    ) -> impl DoubleEndedIterator<Item = io::Result<InlineArray>> {\n        self.into_iter().map(|kv_res| kv_res.map(|(_k, v)| v))\n    }\n}\n\nimpl<const LEAF_FANOUT: usize> IntoIterator for &Tree<LEAF_FANOUT> {\n    type Item = io::Result<(InlineArray, InlineArray)>;\n    type IntoIter = Iter<LEAF_FANOUT>;\n\n    fn into_iter(self) -> Self::IntoIter {\n        self.iter()\n    }\n}\n\n/// A batch of updates that will\n/// be applied atomically to the\n/// Tree.\n///\n/// # Examples\n///\n/// ```\n/// # fn main() -> Result<(), Box<dyn std::error::Error>> {\n/// use sled::{Batch, open};\n///\n/// # let _ = std::fs::remove_dir_all(\"batch_db_2\");\n/// let db: sled::Db<1024> = open(\"batch_db_2\")?;\n/// db.insert(\"key_0\", \"val_0\")?;\n///\n/// let mut batch = Batch::default();\n/// batch.insert(\"key_a\", \"val_a\");\n/// batch.insert(\"key_b\", \"val_b\");\n/// batch.insert(\"key_c\", \"val_c\");\n/// batch.remove(\"key_0\");\n///\n/// db.apply_batch(batch)?;\n/// // key_0 no longer exists, and key_a, key_b, and key_c\n/// // now do exist.\n/// # 
let _ = std::fs::remove_dir_all(\"batch_db_2\");\n/// # Ok(()) }\n/// ```\n#[derive(Debug, Default, Clone, PartialEq, Eq)]\npub struct Batch {\n    pub(crate) writes:\n        std::collections::BTreeMap<InlineArray, Option<InlineArray>>,\n}\n\nimpl Batch {\n    /// Set a key to a new value\n    pub fn insert<K, V>(&mut self, key: K, value: V)\n    where\n        K: Into<InlineArray>,\n        V: Into<InlineArray>,\n    {\n        self.writes.insert(key.into(), Some(value.into()));\n    }\n\n    /// Remove a key\n    pub fn remove<K>(&mut self, key: K)\n    where\n        K: Into<InlineArray>,\n    {\n        self.writes.insert(key.into(), None);\n    }\n\n    /// Get a value if it is present in the `Batch`.\n    /// `Some(None)` means it's present as a deletion.\n    pub fn get<K: AsRef<[u8]>>(&self, k: K) -> Option<Option<&InlineArray>> {\n        let inner = self.writes.get(k.as_ref())?;\n        Some(inner.as_ref())\n    }\n}\n"
  },
  {
    "path": "tests/00_regression.rs",
    "content": "mod common;\nmod tree;\n\nuse std::alloc::{Layout, System};\n\nuse tree::{Key, Op::*, prop_tree_matches_btreemap};\n\n#[global_allocator]\nstatic ALLOCATOR: ShredAllocator = ShredAllocator;\n\n#[derive(Default, Debug, Clone, Copy)]\nstruct ShredAllocator;\n\nunsafe impl std::alloc::GlobalAlloc for ShredAllocator {\n    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {\n        unsafe {\n            assert!(layout.size() < 1_000_000_000);\n            let ret = System.alloc(layout);\n            assert_ne!(ret, std::ptr::null_mut());\n            std::ptr::write_bytes(ret, 0xa1, layout.size());\n            ret\n        }\n    }\n\n    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {\n        unsafe {\n            std::ptr::write_bytes(ptr, 0xde, layout.size());\n            System.dealloc(ptr, layout)\n        }\n    }\n}\n\n#[allow(dead_code)]\nconst INTENSITY: usize = 10;\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_00() {\n    // postmortem:\n    prop_tree_matches_btreemap(vec![Restart], false, 0, 256);\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_01() {\n    // postmortem:\n    // this was a bug in the snapshot recovery, where\n    // it led to max_id dropping by 1 after a restart.\n    // postmortem 2:\n    // we were stalling here because we had a new log with stable of\n    // SEG_HEADER_LEN, but when we iterated over it to create a new\n    // snapshot (snapshot every 1 set in Config), we iterated up until\n    // that offset. 
make_stable requires our stable offset to be >=\n    // the provided one, to deal with 0.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![32]), 9),\n            Set(Key(vec![195]), 13),\n            Restart,\n            Set(Key(vec![164]), 147),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_02() {\n    // postmortem:\n    // this was a bug in the way that the `Materializer`\n    // was fed data, possibly out of order, if recover\n    // in the pagecache had to run over log entries\n    // that were later run through the same `Materializer`\n    // then the second time (triggered by a snapshot)\n    // would not pick up on the importance of seeing\n    // the new root set.\n    // postmortem 2: when refactoring iterators, failed\n    // to account for node.hi being empty on the infinity\n    // shard\n    prop_tree_matches_btreemap(\n        vec![\n            Restart,\n            Set(Key(vec![215]), 121),\n            Restart,\n            Set(Key(vec![216]), 203),\n            Scan(Key(vec![210]), 4),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_03() {\n    // postmortem: the tree was not persisting and recovering root hoists\n    // postmortem 2: when refactoring the log storage, we failed to restart\n    // log writing in the proper location.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![113]), 204),\n            Set(Key(vec![119]), 205),\n            Set(Key(vec![166]), 88),\n            Set(Key(vec![23]), 44),\n            Restart,\n            Set(Key(vec![226]), 192),\n            Set(Key(vec![189]), 186),\n            Restart,\n            Scan(Key(vec![198]), 11),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_04() {\n    // postmortem: pagecache was failing to replace the LogId list\n    // when 
it encountered a new Update::Compact.\n    // postmortem 2: after refactoring log storage, we were not properly\n    // setting the log tip, and the beginning got clobbered after writing\n    // after a restart.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![158]), 31),\n            Set(Key(vec![111]), 134),\n            Set(Key(vec![230]), 187),\n            Set(Key(vec![169]), 58),\n            Set(Key(vec![131]), 10),\n            Set(Key(vec![108]), 246),\n            Set(Key(vec![127]), 155),\n            Restart,\n            Set(Key(vec![59]), 119),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_05() {\n    // postmortem: during recovery, the segment accountant was failing to\n    // properly set the file's tip.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![231]), 107),\n            Set(Key(vec![251]), 42),\n            Set(Key(vec![80]), 81),\n            Set(Key(vec![178]), 130),\n            Set(Key(vec![150]), 232),\n            Restart,\n            Set(Key(vec![98]), 78),\n            Set(Key(vec![0]), 45),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_06() {\n    // postmortem: after reusing segments, we were failing to checksum reads\n    // performed while iterating over rewritten segment buffers, and using\n    // former garbage data. 
fix: use the crc that's there for catching torn\n    // writes with high probability, AND zero out buffers.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![162]), 8),\n            Set(Key(vec![59]), 192),\n            Set(Key(vec![238]), 83),\n            Set(Key(vec![151]), 231),\n            Restart,\n            Set(Key(vec![30]), 206),\n            Set(Key(vec![150]), 146),\n            Set(Key(vec![18]), 34),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_07() {\n    // postmortem: the segment accountant was not fully recovered, and thought\n    // that it could reuse a particular segment that wasn't actually empty\n    // yet.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![135]), 22),\n            Set(Key(vec![41]), 36),\n            Set(Key(vec![101]), 31),\n            Set(Key(vec![111]), 35),\n            Restart,\n            Set(Key(vec![47]), 36),\n            Set(Key(vec![79]), 114),\n            Set(Key(vec![64]), 9),\n            Scan(Key(vec![196]), 25),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_08() {\n    // postmortem: failed to properly recover the state in the segment\n    // accountant that tracked the previously issued segment.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![145]), 151),\n            Set(Key(vec![155]), 148),\n            Set(Key(vec![131]), 170),\n            Set(Key(vec![163]), 60),\n            Set(Key(vec![225]), 126),\n            Restart,\n            Set(Key(vec![64]), 237),\n            Set(Key(vec![102]), 205),\n            Restart,\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_09() {\n    // postmortem 1: was failing to load existing snapshots on initialization.\n    // would encounter uninitialized segments at the log tip and 
overwrite\n    // the first segment (indexed by LSN of 0) in the segment accountant\n    // ordering, skipping over important updates.\n    //\n    // postmortem 2: page size tracking was inconsistent in SA. completely\n    // removed exact size tracking, and went back to simpler pure-page\n    // tenancy model.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![189]), 36),\n            Set(Key(vec![254]), 194),\n            Set(Key(vec![132]), 50),\n            Set(Key(vec![91]), 221),\n            Set(Key(vec![126]), 6),\n            Set(Key(vec![199]), 183),\n            Set(Key(vec![71]), 125),\n            Scan(Key(vec![67]), 16),\n            Set(Key(vec![190]), 16),\n            Restart,\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_10() {\n    // postmortem: after reusing a segment, but not completely writing a\n    // segment, we were hitting an old LSN and violating an assert, rather\n    // than just ending.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![152]), 163),\n            Set(Key(vec![105]), 191),\n            Set(Key(vec![207]), 217),\n            Set(Key(vec![128]), 19),\n            Set(Key(vec![106]), 22),\n            Scan(Key(vec![20]), 24),\n            Set(Key(vec![14]), 150),\n            Set(Key(vec![80]), 43),\n            Set(Key(vec![174]), 134),\n            Set(Key(vec![20]), 150),\n            Set(Key(vec![13]), 171),\n            Restart,\n            Scan(Key(vec![240]), 25),\n            Scan(Key(vec![77]), 37),\n            Set(Key(vec![153]), 232),\n            Del(Key(vec![2])),\n            Set(Key(vec![227]), 169),\n            Get(Key(vec![232])),\n            Cas(Key(vec![247]), 151, 70),\n            Set(Key(vec![78]), 52),\n            Get(Key(vec![16])),\n            Del(Key(vec![78])),\n            Cas(Key(vec![201]), 93, 196),\n            Set(Key(vec![172]), 84),\n        ],\n        true,\n        
0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_11() {\n    // postmortem: a stall was happening because LSNs and LogIds were being\n    // conflated in calls to make_stable. A higher LogId than any LSN was\n    // being created, then passed in.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![38]), 148),\n            Set(Key(vec![176]), 175),\n            Set(Key(vec![82]), 88),\n            Set(Key(vec![164]), 85),\n            Set(Key(vec![139]), 74),\n            Set(Key(vec![73]), 23),\n            Cas(Key(vec![34]), 67, 151),\n            Set(Key(vec![115]), 133),\n            Set(Key(vec![249]), 138),\n            Restart,\n            Set(Key(vec![243]), 6),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_12() {\n    // postmortem: was not checking that a log entry's LSN matches its position\n    // as part of detecting tears / partial rewrites.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![118]), 156),\n            Set(Key(vec![8]), 63),\n            Set(Key(vec![165]), 110),\n            Set(Key(vec![219]), 108),\n            Set(Key(vec![91]), 61),\n            Set(Key(vec![18]), 98),\n            Scan(Key(vec![73]), 6),\n            Set(Key(vec![240]), 108),\n            Cas(Key(vec![71]), 28, 189),\n            Del(Key(vec![199])),\n            Restart,\n            Set(Key(vec![30]), 140),\n            Scan(Key(vec![118]), 13),\n            Get(Key(vec![180])),\n            Cas(Key(vec![115]), 151, 116),\n            Restart,\n            Set(Key(vec![31]), 95),\n            Cas(Key(vec![79]), 153, 225),\n            Set(Key(vec![34]), 161),\n            Get(Key(vec![213])),\n            Set(Key(vec![237]), 215),\n            Del(Key(vec![52])),\n            Set(Key(vec![56]), 78),\n            Scan(Key(vec![141]), 2),\n            Cas(Key(vec![228]), 114, 170),\n            Get(Key(vec![231])),\n       
     Get(Key(vec![223])),\n            Del(Key(vec![167])),\n            Restart,\n            Scan(Key(vec![240]), 31),\n            Del(Key(vec![54])),\n            Del(Key(vec![2])),\n            Set(Key(vec![117]), 165),\n            Set(Key(vec![223]), 50),\n            Scan(Key(vec![69]), 4),\n            Get(Key(vec![156])),\n            Set(Key(vec![214]), 72),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_13() {\n    // postmortem: failed root hoists were being improperly recovered before the\n    // following free was done on their page, but we treated the written node as\n    // if it were a successful completed root hoist.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![42]), 10),\n            Set(Key(vec![137]), 220),\n            Set(Key(vec![183]), 129),\n            Set(Key(vec![91]), 145),\n            Set(Key(vec![126]), 26),\n            Set(Key(vec![255]), 67),\n            Set(Key(vec![69]), 18),\n            Restart,\n            Set(Key(vec![24]), 92),\n            Set(Key(vec![193]), 17),\n            Set(Key(vec![3]), 143),\n            Cas(Key(vec![50]), 13, 84),\n            Restart,\n            Set(Key(vec![191]), 116),\n            Restart,\n            Del(Key(vec![165])),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_14() {\n    // postmortem: after adding prefix compression, we were not\n    // handling re-inserts and deletions properly\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![107]), 234),\n            Set(Key(vec![7]), 245),\n            Set(Key(vec![40]), 77),\n            Set(Key(vec![171]), 244),\n            Set(Key(vec![173]), 16),\n            Set(Key(vec![171]), 176),\n            Scan(Key(vec![93]), 33),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_15() {\n    
// postmortem: was not sorting keys properly when binary searching for them\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![102]), 165),\n            Set(Key(vec![91]), 191),\n            Set(Key(vec![141]), 228),\n            Set(Key(vec![188]), 124),\n            Del(Key(vec![141])),\n            Scan(Key(vec![101]), 26),\n        ],\n        true,\n        0,\n        256,\n    );\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_16() {\n    // postmortem: the test merge function was not properly adding numbers.\n    prop_tree_matches_btreemap(\n        vec![Merge(Key(vec![247]), 162), Scan(Key(vec![209]), 31)],\n        false,\n        0,\n256\n    );\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_17() {\n    // postmortem: we were creating a copy of a node leaf during iteration\n    // before accidentally putting it into a PinnedValue, despite the\n    // fact that it was not actually part of the node's actual memory!\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![194, 215, 103, 0, 138, 11, 248, 131]), 70),\n            Scan(Key(vec![]), 30),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_18() {\n    // postmortem: when implementing get_gt and get_lt, there were some\n    // issues with getting order comparisons correct.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![]), 19),\n            Set(Key(vec![78]), 98),\n            Set(Key(vec![255]), 224),\n            Set(Key(vec![]), 131),\n            Get(Key(vec![255])),\n            GetGt(Key(vec![89])),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_19() {\n    // postmortem: we were not seeking properly to the next node\n    // when we hit a half-split child and were using get_lt\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![]), 138),\n            
Set(Key(vec![68]), 113),\n            Set(Key(vec![155]), 73),\n            Set(Key(vec![50]), 220),\n            Set(Key(vec![]), 247),\n            GetLt(Key(vec![100])),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_20() {\n    // postmortem: we were not seeking forward during get_gt\n    // if path_for_key reached a leaf that didn't include\n    // a key for our\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![]), 10),\n            Set(Key(vec![56]), 42),\n            Set(Key(vec![138]), 27),\n            Set(Key(vec![155]), 73),\n            Set(Key(vec![]), 251),\n            GetGt(Key(vec![94])),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_21() {\n    // postmortem: more split woes while implementing get_lt\n    // postmortem 2: failed to properly account for node hi key\n    // being empty in the view predecessor function\n    // postmortem 3: when rewriting Iter, failed to account for\n    // direction of iteration\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![176]), 163),\n            Set(Key(vec![]), 229),\n            Set(Key(vec![169]), 121),\n            Set(Key(vec![]), 58),\n            GetLt(Key(vec![176])),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_22() {\n    // postmortem: inclusivity wasn't being properly flipped off after\n    // the first result during iteration\n    // postmortem 2: failed to properly check bounds while iterating\n    prop_tree_matches_btreemap(\n        vec![\n            Merge(Key(vec![]), 155),\n            Merge(Key(vec![56]), 251),\n            Scan(Key(vec![]), 2),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_23() {\n    // postmortem: when rewriting CRC handling code, 
mis-sized the blob crc\n    prop_tree_matches_btreemap(\n        vec![Set(Key(vec![6; 5120]), 92), Restart, Scan(Key(vec![]), 35)],\n        false,\n        0,\n        256,\n    );\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_24() {\n    // postmortem: get_gt diverged with the Iter impl\n    prop_tree_matches_btreemap(\n        vec![\n            Merge(Key(vec![]), 193),\n            Del(Key(vec![])),\n            Del(Key(vec![])),\n            Set(Key(vec![]), 55),\n            Set(Key(vec![]), 212),\n            Merge(Key(vec![]), 236),\n            Del(Key(vec![])),\n            Set(Key(vec![]), 192),\n            Del(Key(vec![])),\n            Set(Key(vec![94]), 115),\n            Merge(Key(vec![62]), 34),\n            GetGt(Key(vec![])),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_25() {\n    // postmortem: was not accounting for merges when traversing\n    // the frag chain and a Del was encountered\n    prop_tree_matches_btreemap(\n        vec![Del(Key(vec![])), Merge(Key(vec![]), 84), Get(Key(vec![]))],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_26() {\n    // postmortem:\n    prop_tree_matches_btreemap(\n        vec![\n            Merge(Key(vec![]), 194),\n            Merge(Key(vec![62]), 114),\n            Merge(Key(vec![80]), 202),\n            Merge(Key(vec![]), 169),\n            Set(Key(vec![]), 197),\n            Del(Key(vec![])),\n            Del(Key(vec![])),\n            Set(Key(vec![]), 215),\n            Set(Key(vec![]), 164),\n            Merge(Key(vec![]), 150),\n            GetGt(Key(vec![])),\n            GetLt(Key(vec![80])),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_27() {\n    // postmortem: was not accounting for the fact that deletions reduce the\n    // chances of being able to split successfully.\n    
prop_tree_matches_btreemap(\n        vec![\n            Del(Key(vec![])),\n            Merge(\n                Key(vec![\n                    74, 117, 68, 37, 89, 16, 84, 130, 133, 78, 74, 59, 44, 109,\n                    34, 5, 36, 74, 131, 100, 79, 86, 87, 107, 87, 27, 1, 85,\n                    53, 112, 89, 75, 67, 78, 58, 121, 0, 105, 8, 117, 79, 40,\n                    94, 123, 83, 72, 78, 23, 23, 35, 50, 77, 59, 75, 54, 92,\n                    89, 12, 27, 48, 64, 21, 42, 97, 45, 28, 122, 13, 4, 32, 51,\n                    25, 26, 18, 65, 12, 54, 104, 106, 80, 75, 91, 111, 9, 5,\n                    130, 43, 40, 3, 72, 0, 58, 92, 64, 112, 97, 75, 130, 11,\n                    135, 19, 107, 40, 17, 25, 49, 48, 119, 82, 54, 35, 113, 91,\n                    68, 12, 118, 123, 62, 108, 88, 67, 43, 33, 119, 132, 124,\n                    1, 62, 133, 110, 25, 62, 129, 117, 117, 107, 123, 94, 127,\n                    80, 0, 116, 101, 9, 9, 54, 134, 70, 66, 79, 50, 124, 115,\n                    85, 42, 120, 24, 15, 81, 100, 72, 71, 40, 58, 22, 6, 34,\n                    54, 69, 110, 18, 74, 111, 80, 52, 90, 44, 4, 29, 84, 95,\n                    21, 25, 10, 10, 60, 18, 78, 23, 21, 114, 92, 96, 17, 127,\n                    53, 86, 2, 60, 104, 8, 132, 44, 115, 6, 25, 80, 46, 12, 20,\n                    44, 67, 136, 127, 50, 55, 70, 41, 90, 16, 10, 44, 32, 24,\n                    106, 13, 104,\n                ]),\n                219,\n            ),\n            Merge(Key(vec![]), 71),\n            Del(Key(vec![])),\n            Set(Key(vec![0]), 146),\n            Merge(Key(vec![13]), 155),\n            Merge(Key(vec![]), 14),\n            Del(Key(vec![])),\n            Set(Key(vec![]), 150),\n            Set(\n                Key(vec![\n                    13, 8, 3, 6, 9, 14, 3, 13, 7, 12, 13, 7, 13, 13, 1, 13, 5,\n                    4, 3, 2, 6, 16, 17, 10, 0, 16, 12, 0, 16, 1, 0, 15, 15, 4,\n                    1, 6, 9, 9, 11, 16, 7, 6, 10, 1, 11, 10, 
4, 9, 9, 14, 4,\n                    12, 16, 10, 15, 2, 1, 8, 4,\n                ]),\n                247,\n            ),\n            Del(Key(vec![154])),\n            Del(Key(vec![])),\n            Del(Key(vec![\n                0, 24, 24, 31, 40, 23, 10, 30, 16, 41, 30, 23, 14, 25, 21, 19,\n                18, 7, 17, 41, 11, 5, 14, 42, 11, 22, 4, 8, 4, 38, 33, 31, 3,\n                30, 40, 22, 40, 39, 5, 40, 1, 41, 11, 26, 25, 33, 12, 38, 4,\n                35, 30, 42, 19, 26, 23, 22, 39, 18, 29, 4, 1, 24, 14, 38, 0,\n                36, 27, 11, 27, 34, 16, 15, 38, 0, 20, 37, 22, 31, 12, 26, 16,\n                4, 22, 25, 4, 34, 4, 33, 37, 28, 18, 4, 41, 15, 8, 16, 27, 3,\n                20, 26, 40, 31, 15, 15, 17, 15, 5, 13, 22, 37, 7, 13, 35, 14,\n                6, 28, 21, 26, 13, 35, 1, 10, 8, 34, 23, 27, 29, 8, 14, 42, 36,\n                31, 34, 12, 31, 24, 5, 8, 11, 36, 29, 24, 38, 8, 12, 18, 22,\n                36, 21, 28, 11, 24, 0, 41, 37, 39, 42, 25, 13, 41, 27, 8, 24,\n                22, 30, 17, 2, 4, 20, 33, 5, 24, 33, 6, 29, 5, 0, 17, 9, 20,\n                26, 15, 23, 22, 16, 23, 16, 1, 20, 0, 28, 16, 34, 30, 19, 5,\n                36, 40, 28, 6, 39,\n            ])),\n            Merge(Key(vec![]), 50),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_28() {\n    // postmortem:\n    prop_tree_matches_btreemap(\n        vec![\n            Del(Key(vec![])),\n            Set(Key(vec![]), 65),\n            Del(Key(vec![])),\n            Del(Key(vec![])),\n            Merge(Key(vec![]), 50),\n            Merge(Key(vec![]), 2),\n            Del(Key(vec![197])),\n            Merge(Key(vec![5]), 146),\n            Set(Key(vec![222]), 224),\n            Merge(Key(vec![149]), 60),\n            Scan(Key(vec![178]), 18),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_29() {\n    // postmortem: tree merge and 
split thresholds caused an infinite\n    // loop while performing updates\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![]), 142),\n            Merge(\n                Key(vec![\n                    45, 47, 6, 67, 16, 12, 62, 35, 69, 80, 49, 61, 29, 82, 9,\n                    47, 25, 78, 47, 64, 29, 74, 45, 0, 37, 44, 21, 82, 55, 44,\n                    31, 60, 86, 18, 45, 67, 55, 21, 35, 46, 25, 51, 5, 32, 33,\n                    36, 1, 81, 28, 28, 79, 76, 80, 89, 80, 62, 8, 85, 50, 15,\n                    4, 11, 76, 72, 73, 47, 30, 50, 85, 67, 84, 13, 82, 84, 78,\n                    70, 42, 83, 8, 7, 50, 77, 85, 37, 47, 82, 86, 46, 30, 27,\n                    5, 39, 70, 26, 59, 16, 6, 34, 56, 40, 40, 67, 16, 61, 63,\n                    56, 64, 31, 15, 81, 84, 19, 61, 66, 3, 7, 40, 56, 13, 40,\n                    64, 50, 88, 47, 88, 50, 63, 65, 79, 62, 1, 44, 59, 27, 12,\n                    60, 3, 36, 89, 45, 18, 4, 68, 48, 61, 30, 48, 26, 84, 49,\n                    3, 74, 51, 53, 30, 57, 50, 35, 74, 59, 30, 73, 19, 30, 82,\n                    78, 3, 5, 62, 17, 48, 29, 67, 52, 45, 61, 74, 52, 29, 61,\n                    63, 11, 89, 76, 34, 8, 50, 75, 42, 12, 5, 55, 0, 59, 44,\n                    68, 26, 76, 37, 50, 53, 73, 53, 76, 57, 40, 30, 52, 0, 41,\n                    21, 8, 79, 79, 38, 37, 50, 56, 43, 9, 85, 21, 60, 64, 13,\n                    54, 60, 83, 1, 2, 37, 75, 42, 0, 83, 81, 80, 87, 12, 15,\n                    75, 55, 41, 59, 9, 80, 66, 27, 65, 26, 48, 29, 37, 38, 9,\n                    76, 31, 39, 35, 22, 73, 59, 28, 33, 35, 63, 78, 17, 22, 82,\n                    12, 60, 49, 26, 54, 19, 60, 29, 39, 37, 10, 50, 12, 19, 29,\n                    1, 74, 12, 5, 38, 49, 41, 19, 88, 3, 27, 77, 81, 72, 42,\n                    71, 86, 82, 11, 79, 40, 35, 26, 35, 64, 4, 33, 87, 31, 84,\n                    81, 74, 31, 49, 0, 29, 73, 14, 55, 78, 21, 23, 20, 83, 48,\n                    89, 88, 62, 64, 73, 7, 20, 
70, 81, 64, 3, 79, 38, 75, 13,\n                    40, 29, 82, 40, 14, 66, 56, 54, 52, 37, 14, 67, 8, 37, 1,\n                    5, 73, 14, 35, 63, 48, 46, 22, 84, 71, 2, 60, 63, 88, 14,\n                    15, 69, 88, 2, 43, 57, 43, 52, 18, 78, 75, 75, 74, 13, 35,\n                    50, 35, 17, 13, 64, 82, 55, 32, 14, 57, 35, 77, 65, 22, 40,\n                    27, 39, 80, 23, 20, 41, 50, 48, 22, 84, 37, 59, 45, 64, 10,\n                    3, 69, 56, 24, 4, 25, 76, 65, 47, 52, 64, 88, 3, 23, 37,\n                    16, 56, 69, 71, 27, 87, 65, 74, 23, 82, 41, 60, 78, 75, 22,\n                    51, 15, 57, 80, 46, 73, 7, 1, 36, 64, 0, 56, 83, 74, 62,\n                    73, 81, 68, 71, 63, 31, 5, 23, 11, 15, 39, 2, 10, 23, 18,\n                    74, 3, 43, 25, 68, 54, 11, 21, 14, 58, 10, 73, 0, 66, 28,\n                    73, 25, 40, 55, 56, 33, 81, 67, 43, 35, 65, 38, 21, 48, 81,\n                    4, 77, 68, 51, 38, 36, 49, 43, 33, 51, 28, 43, 60, 71, 78,\n                    48, 49, 76, 21, 0, 72, 0, 32, 78, 12, 87, 5, 80, 62, 40,\n                    85, 26, 70, 58, 56, 78, 7, 53, 30, 16, 22, 12, 23, 37, 83,\n                    45, 33, 41, 83, 78, 87, 44, 0, 65, 51, 3, 8, 72, 38, 14,\n                    24, 64, 77, 45, 5, 1, 7, 27, 82, 7, 6, 70, 25, 67, 22, 8,\n                    30, 76, 41, 11, 14, 1, 65, 85, 60, 80, 0, 30, 31, 79, 43,\n                    89, 33, 84, 22, 7, 67, 45, 39, 74, 75, 12, 61, 19, 71, 66,\n                    83, 57, 38, 45, 21, 18, 37, 54, 36, 14, 54, 63, 81, 12, 7,\n                    10, 39, 16, 40, 10, 7, 81, 45, 12, 22, 20, 29, 85, 40, 41,\n                    72, 79, 58, 50, 41, 59, 64, 41, 32, 56, 35, 8, 60, 17, 14,\n                    89, 17, 7, 48, 6, 35, 9, 34, 54, 6, 44, 87, 76, 50, 1, 67,\n                    70, 15, 8, 4, 45, 67, 86, 32, 69, 3, 88, 85, 72, 66, 21,\n                    89, 11, 77, 1, 50, 75, 56, 41, 74, 6, 4, 51, 65, 39, 50,\n                    45, 56, 3, 19, 80, 86, 55, 48, 81, 17, 3, 
89, 7, 9, 63, 58,\n                    80, 39, 34, 85, 55, 71, 41, 55, 8, 63, 38, 51, 47, 49, 83,\n                    2, 73, 22, 39, 18, 45, 77, 56, 80, 54, 13, 23, 81, 54, 15,\n                    48, 57, 83, 71, 41, 32, 64, 1, 9, 46, 27, 16, 21, 7, 28,\n                    55, 17, 71, 68, 17, 74, 46, 38, 84, 3, 12, 71, 63, 16, 23,\n                    48, 12, 29, 28, 5, 21, 61, 14, 77, 66, 62, 57, 18, 30, 63,\n                    14, 41, 37, 30, 73, 16, 12, 74, 8, 82, 67, 53, 10, 5, 37,\n                    36, 39, 52, 37, 72, 76, 21, 35, 40, 42, 55, 47, 50, 41, 19,\n                    40, 86, 26, 54, 23, 74, 46, 66, 59, 80, 26, 81, 61, 80, 88,\n                    55, 40, 30, 45, 7, 46, 21, 3, 20, 46, 63, 18, 9, 34, 67, 9,\n                    19, 52, 53, 29, 69, 78, 65, 39, 71, 40, 38, 57, 80, 27, 34,\n                    30, 27, 55, 8, 65, 31, 37, 33, 25, 39, 46, 9, 83, 6, 27,\n                    28, 61, 9, 21, 58, 21, 10, 69, 24, 5, 31, 32, 44, 26, 84,\n                    73, 73, 9, 64, 26, 21, 85, 12, 39, 81, 38, 49, 24, 35, 3,\n                    88, 15, 15, 76, 64, 70, 9, 30, 51, 26, 16, 70, 60, 15, 7,\n                    54, 36, 32, 9, 10, 18, 66, 19, 25, 77, 46, 51, 51, 14, 41,\n                    56, 65, 41, 87, 26, 10, 2, 73, 2, 71, 26, 56, 10, 68, 15,\n                    53, 10, 43, 15, 22, 45, 2, 15, 16, 69, 80, 83, 18, 22, 70,\n                    77, 52, 48, 24, 17, 40, 56, 22, 17, 3, 36, 46, 37, 41, 22,\n                    0, 41, 45, 14, 15, 73, 18, 42, 34, 5, 87, 6, 2, 7, 58, 3,\n                    86, 87, 7, 79, 88, 33, 30, 48, 3, 66, 27, 34, 58, 48, 71,\n                    40, 1, 46, 84, 32, 63, 79, 0, 21, 71, 1, 59, 39, 77, 51,\n                    14, 20, 58, 83, 19, 0, 2, 2, 57, 73, 79, 42, 59, 33, 50,\n                    15, 11, 48, 25, 14, 39, 36, 88, 71, 28, 45, 15, 59, 39, 60,\n                    78, 18, 18, 45, 50, 29, 66, 86, 5, 76, 85, 55, 17, 28, 8,\n                    39, 75, 33, 9, 73, 71, 59, 56, 57, 86, 6, 75, 26, 
43, 68,\n                    34, 82, 88, 76, 17, 86, 63, 2, 38, 63, 13, 44, 8, 25, 0,\n                    63, 54, 73, 52, 3, 72,\n                ]),\n                9,\n            ),\n            Set(Key(vec![]), 35),\n            Set(\n                Key(vec![\n                    165, 64, 99, 55, 152, 102, 148, 35, 59, 10, 198, 191, 71,\n                    129, 170, 155, 7, 106, 171, 93, 126,\n                ]),\n                212,\n            ),\n            Del(Key(vec![])),\n            Merge(Key(vec![]), 177),\n            Merge(\n                Key(vec![\n                    20, 55, 154, 104, 10, 68, 64, 3, 31, 78, 232, 227, 169,\n                    161, 13, 50, 16, 239, 87, 0, 9, 85, 248, 32, 156, 106, 11,\n                    18, 57, 13, 177, 36, 69, 176, 101, 92, 119, 38, 218, 26, 4,\n                    154, 185, 135, 75, 167, 101, 107, 206, 76, 153, 213, 70,\n                    52, 205, 95, 55, 116, 242, 68, 77, 90, 249, 142, 93, 135,\n                    118, 127, 116, 121, 235, 183, 215, 2, 118, 193, 146, 185,\n                    4, 129, 167, 164, 178, 105, 149, 47, 73, 121, 95, 23, 216,\n                    153, 23, 108, 141, 190, 250, 121, 98, 229, 33, 106, 89,\n                    117, 122, 145, 47, 242, 81, 88, 141, 38, 177, 170, 167, 56,\n                    24, 196, 61, 97, 83, 91, 202, 181, 75, 112, 3, 169, 61, 17,\n                    100, 81, 111, 178, 122, 176, 95, 185, 169, 146, 239, 40,\n                    168, 32, 170, 34, 172, 89, 59, 188, 170, 186, 61, 7, 177,\n                    230, 130, 155, 208, 171, 82, 153, 20, 72, 74, 111, 147,\n                    178, 164, 157, 71, 114, 216, 40, 85, 91, 20, 145, 149, 95,\n                    36, 114, 24, 129, 144, 229, 14, 133, 77, 92, 139, 167, 48,\n                    18, 178, 4, 15, 171, 171, 88, 74, 104, 157, 2, 121, 13,\n                    141, 6, 107, 118, 228, 147, 152, 28, 206, 128, 102, 150, 1,\n                    129, 84, 171, 119, 110, 198, 72, 100, 166, 153, 98, 
66,\n                    128, 79, 41, 126,\n                ]),\n                103,\n            ),\n            Del(Key(vec![])),\n            Merge(\n                Key(vec![\n                    117, 48, 90, 153, 149, 191, 229, 73, 3, 6, 73, 52, 73, 186,\n                    42, 53, 94, 17, 61, 11, 153, 118, 219, 188, 184, 89, 13,\n                    124, 138, 40, 238, 9, 46, 45, 38, 115, 153, 106, 166, 56,\n                    134, 206, 140, 57, 95, 244, 27, 135, 43, 13, 143, 137, 56,\n                    122, 243, 205, 52, 116, 130, 35, 80, 167, 58, 93,\n                ]),\n                8,\n            ),\n            Set(Key(vec![145]), 43),\n            GetLt(Key(vec![229])),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_30() {\n    // postmortem:\n    prop_tree_matches_btreemap(\n        vec![\n            Merge(Key(vec![]), 241),\n            Set(Key(vec![20]), 146),\n            Merge(\n                Key(vec![\n                    60, 38, 29, 57, 35, 71, 15, 46, 7, 27, 76, 84, 27, 25, 90,\n                    30, 37, 63, 11, 24, 27, 28, 94, 93, 82, 68, 69, 61, 46, 86,\n                    11, 86, 63, 34, 90, 71, 92, 87, 38, 48, 40, 78, 9, 37, 26,\n                    36, 60, 4, 2, 38, 32, 73, 86, 43, 52, 79, 11, 43, 59, 21,\n                    60, 40, 80, 94, 69, 44, 4, 73, 59, 16, 16, 22, 88, 41, 13,\n                    21, 91, 33, 49, 91, 20, 79, 23, 61, 53, 63, 58, 62, 49, 10,\n                    71, 72, 27, 55, 53, 39, 91, 82, 86, 38, 41, 1, 54, 3, 77,\n                    15, 93, 31, 49, 29, 82, 7, 17, 58, 42, 12, 49, 67, 62, 46,\n                    20, 27, 61, 32, 58, 9, 17, 19, 28, 44, 41, 34, 94, 11, 50,\n                    73, 1, 50, 48, 8, 88, 33, 40, 51, 15, 35, 2, 36, 37, 30,\n                    37, 83, 71, 91, 32, 0, 69, 28, 64, 30, 72, 63, 39, 7, 89,\n                    0, 21, 51, 92, 80, 13, 57, 7, 53, 94, 26, 2, 63, 18, 23,\n                    89, 34, 
83, 55, 32, 75, 81, 27, 11, 5, 63, 0, 75, 12, 39,\n                    9, 13, 20, 25, 57, 94, 75, 59, 46, 84, 80, 61, 24, 31, 7,\n                    68, 93, 12, 94, 6, 94, 27, 33, 81, 19, 3, 78, 3, 14, 22,\n                    36, 49, 61, 51, 79, 43, 35, 58, 54, 65, 72, 36, 87, 3, 3,\n                    25, 75, 82, 58, 75, 76, 29, 89, 1, 16, 64, 63, 85, 0, 47,\n                ]),\n                11,\n            ),\n            Merge(Key(vec![25]), 245),\n            Merge(Key(vec![119]), 152),\n            Scan(Key(vec![]), 31),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_31() {\n    // postmortem:\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![1]), 212),\n            Set(Key(vec![12]), 174),\n            Set(Key(vec![]), 182),\n            Set(\n                Key(vec![\n                    12, 55, 46, 38, 40, 34, 44, 32, 19, 15, 28, 49, 35, 40, 55,\n                    35, 61, 9, 62, 18, 3, 58,\n                ]),\n                86,\n            ),\n            Scan(Key(vec![]), -18),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_32() {\n    // postmortem: the MAX_IVEC that predecessor used in reverse\n    // iteration was setting the first byte to 0 even though we\n    // no longer perform per-key prefix encoding.\n    prop_tree_matches_btreemap(\n        vec![Set(Key(vec![57]), 141), Scan(Key(vec![]), -40)],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_33() {\n    // postmortem: the split point was being incorrectly\n    // calculated when using the simplified prefix technique.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![]), 91),\n            Set(Key(vec![1]), 216),\n            Set(Key(vec![85, 25]), 78),\n            Set(Key(vec![85]), 43),\n            GetLt(Key(vec![])),\n        ],\n        
false,\n        0,\n        256,\n    );\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_34() {\n    // postmortem: a safety check was too aggressive when\n    // finding predecessors using the new simplified prefix\n    // encoding technique.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![9, 212]), 100),\n            Set(Key(vec![9]), 63),\n            Set(Key(vec![5]), 100),\n            Merge(Key(vec![]), 16),\n            Set(Key(vec![9, 70]), 188),\n            Scan(Key(vec![]), -40),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_35() {\n    // postmortem: prefix lengths were being incorrectly\n    // handled on splits.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![207]), 29),\n            Set(Key(vec![192]), 218),\n            Set(Key(vec![121]), 167),\n            Set(Key(vec![189]), 40),\n            Set(Key(vec![85]), 197),\n            Set(Key(vec![185]), 58),\n            Set(Key(vec![84]), 97),\n            Set(Key(vec![23]), 34),\n            Set(Key(vec![47]), 162),\n            Set(Key(vec![39]), 92),\n            Set(Key(vec![46]), 173),\n            Set(Key(vec![33]), 202),\n            Set(Key(vec![8]), 113),\n            Set(Key(vec![17]), 228),\n            Set(Key(vec![8, 49]), 217),\n            Set(Key(vec![6]), 192),\n            Set(Key(vec![5]), 47),\n            Set(Key(vec![]), 5),\n            Set(Key(vec![0]), 103),\n            Set(Key(vec![1]), 230),\n            Set(Key(vec![0, 229]), 117),\n            Set(Key(vec![]), 112),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_36() {\n    // postmortem: suffix truncation caused\n    // regions to be permanently inaccessible\n    // when applied to split points on index\n    // nodes.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![152]), 65),\n           
 Set(Key(vec![]), 227),\n            Set(Key(vec![101]), 23),\n            Merge(Key(vec![254]), 97),\n            Set(Key(vec![254, 5]), 207),\n            Scan(Key(vec![]), -30),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_37() {\n    // postmortem: suffix truncation was so\n    // aggressive that it would cut into\n    // the prefix in the lo key sometimes.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![]), 82),\n            Set(Key(vec![2, 0]), 40),\n            Set(Key(vec![2, 0, 0]), 49),\n            Set(Key(vec![1]), 187),\n            Scan(Key(vec![]), 33),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_38() {\n    // postmortem: Free pages were not being initialized in the\n    // pagecache properly.\n    for _ in 0..10 {\n        prop_tree_matches_btreemap(\n            vec![\n                Set(Key(vec![193]), 73),\n                Merge(Key(vec![117]), 216),\n                Set(Key(vec![221]), 176),\n                GetLt(Key(vec![123])),\n                Restart,\n            ],\n            false,\n            0,\n            256,\n        );\n    }\n}\n*/\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_39() {\n    // postmortem:\n    for _ in 0..100 {\n        prop_tree_matches_btreemap(\n            vec![\n                Set(\n                    Key(vec![\n                        67, 48, 34, 254, 61, 189, 196, 127, 26, 185, 244, 63,\n                        60, 63, 246, 194, 243, 177, 218, 210, 153, 126, 124,\n                        47, 160, 242, 157, 2, 51, 34, 88, 41, 44, 65, 58, 211,\n                        245, 74, 192, 101, 222, 68, 196, 250, 127, 231, 102,\n                        177, 246, 105, 190, 144, 113, 148, 71, 72, 149, 246,\n                        38, 95, 106, 42, 83, 65, 84, 73, 148, 34, 95, 88, 57,\n                        232, 219, 227, 74, 14, 
5, 124, 106, 57, 244, 50, 81,\n                        93, 145, 111, 40, 190, 127, 227, 17, 242, 165, 194,\n                        171, 60, 6, 255, 176, 143, 131, 164, 217, 18, 123, 19,\n                        246, 183, 29, 0, 6, 39, 175, 57, 134, 166, 231, 47,\n                        254, 158, 163, 178, 78, 240, 108, 157, 72, 135, 34,\n                        236, 103, 192, 109, 31, 2, 72, 128, 242, 4, 113, 109,\n                        224, 120, 61, 169, 226, 131, 210, 33, 181, 91, 91, 197,\n                        223, 127, 26, 94, 158, 55, 57, 3, 184, 15, 30, 2, 222,\n                        39, 29, 12, 42, 14, 166, 176, 28, 13, 246, 11, 186, 8,\n                        247, 113, 253, 102, 227, 68, 111, 227, 238, 54, 150,\n                        11, 57, 155, 4, 75, 179, 17, 172, 42, 22, 199, 44, 242,\n                        211, 0, 39, 243, 221, 114, 86, 145, 22, 226, 108, 32,\n                        248, 42, 49, 191, 112, 1, 69, 101, 112, 251, 243, 252,\n                        83, 140, 132, 165,\n                    ]),\n                    250,\n                ),\n                Del(Key(vec![\n                    11, 77, 168, 37, 181, 169, 239, 146, 240, 211, 7, 115, 197,\n                    119, 46, 80, 240, 92, 221, 108, 208, 247, 221, 129, 108,\n                    13, 36, 21, 93, 11, 243, 103, 188, 39, 126, 77, 29, 32,\n                    206, 175, 199, 245, 71, 96, 221, 7, 68, 64, 45, 78, 68,\n                    193, 73, 13, 60, 13, 28, 167, 147, 7, 90, 11, 206, 44, 84,\n                    243, 3, 77, 122, 87, 7, 125, 184, 6, 178, 59,\n                ])),\n                Merge(Key(vec![176]), 123),\n                Restart,\n                Merge(\n                    Key(vec![\n                        93, 43, 181, 76, 63, 247, 227, 15, 17, 239, 9, 252,\n                        181, 53, 65, 74, 22, 18, 71, 64, 115, 58, 110, 30, 13,\n                        177, 31, 47, 124, 14, 0, 157, 200, 194, 92, 215, 21,\n                        36, 
239, 204, 18, 88, 216, 149, 18, 208, 187, 188, 32,\n                        76, 35, 12, 142, 157, 38, 186, 245, 63, 2, 230, 13, 79,\n                        160, 86, 32, 170, 239, 151, 25, 180, 170, 201, 22, 211,\n                        238, 208, 24, 139, 5, 44, 38, 48, 243, 38, 249, 36, 43,\n                        200, 52, 244, 166, 0, 29, 114, 10, 18, 253, 253, 130,\n                        223, 37, 8, 109, 228, 0, 122, 192, 16, 68, 231, 37,\n                        230, 249, 180, 214, 101, 17,\n                    ]),\n                    176,\n                ),\n                Set(\n                    Key(vec![\n                        153, 217, 142, 179, 255, 74, 1, 20, 254, 1, 38, 28, 66,\n                        244, 81, 101, 210, 58, 18, 107, 12, 116, 74, 188, 95,\n                        56, 248, 9, 204, 128, 24, 239, 143, 83, 83, 213, 17,\n                        32, 135, 73, 217, 8, 241, 44, 57, 131, 107, 139, 122,\n                        32, 194, 225, 136, 148, 227, 196, 196, 121, 97, 81, 74,\n                    ]),\n                    42,\n                ),\n                Set(Key(vec![]), 160),\n                GetLt(Key(vec![\n                    244, 145, 243, 120, 149, 64, 125, 161, 98, 205, 205, 107,\n                    191, 119, 83, 42, 92, 119, 25, 198, 47, 123, 26, 224, 190,\n                    98, 144, 238, 74, 36, 76, 186, 226, 153, 69, 217, 109, 214,\n                    201, 104, 148, 107, 132, 219, 37, 109, 98, 172, 70, 160,\n                    177, 115, 194, 80, 76, 60, 148, 176, 191, 84, 109, 35, 51,\n                    107, 157, 11, 233, 126, 71, 183, 215, 116, 72, 235, 218,\n                    171, 233, 181, 53, 253, 104, 231, 138, 166, 40,\n                ])),\n                Set(\n                    Key(vec![\n                        37, 160, 29, 162, 43, 212, 2, 100, 236, 24, 2, 82, 58,\n                        38, 81, 137, 89, 55, 164, 83,\n                    ]),\n                    64,\n                ),\n     
           Get(Key(vec![\n                    15, 53, 101, 33, 156, 199, 212, 82, 2, 64, 136, 70, 235,\n                    72, 170, 188, 180, 200, 109, 231, 6, 13, 30, 70, 4, 132,\n                    133, 101, 82, 187, 78, 241, 157, 49, 156, 3, 17, 167, 216,\n                    209, 7, 174, 112, 186, 170, 189, 85, 99, 119, 52, 39, 38,\n                    151, 108, 203, 42, 63, 255, 216, 234, 34, 2, 80, 168, 122,\n                    70, 20, 11, 220, 106, 49, 110, 165, 170, 149, 163,\n                ])),\n                GetLt(Key(vec![])),\n                Merge(Key(vec![136]), 135),\n                Cas(Key(vec![177]), 159, 209),\n                Cas(Key(vec![101]), 143, 240),\n                Set(Key(vec![226, 62, 34, 63, 172, 96, 162]), 43),\n                Merge(\n                    Key(vec![\n                        48, 182, 144, 255, 137, 100, 2, 139, 69, 111, 159, 133,\n                        234, 147, 118, 231, 155, 74, 73, 98, 58, 36, 35, 21,\n                        50, 42, 71, 25, 200, 5, 4, 198, 158, 41, 88, 75, 153,\n                        254, 248, 213, 0, 89, 43, 160, 58, 206, 88, 107, 57,\n                        208, 119, 34, 80, 166, 112, 13, 241, 46, 172, 115, 179,\n                        42, 59, 200, 225, 125, 65, 18, 173, 77, 27, 129, 228,\n                        68, 53, 175, 61, 230, 27, 136, 131, 171, 64, 79, 125,\n                        149, 52, 80,\n                    ]),\n                    105,\n                ),\n                Merge(\n                    Key(vec![\n                        126, 109, 165, 43, 2, 82, 97, 81, 59, 78, 243, 142, 37,\n                        105, 109, 178, 25, 73, 50, 103, 107, 129, 213, 193,\n                        158, 16, 63, 108, 160, 204, 78, 83, 2, 43, 66, 2, 18,\n                        11, 147, 47, 106, 106, 141, 82, 65, 101, 99, 171, 178,\n                        68, 106, 7, 190, 159, 105, 132, 155, 240, 155, 95, 66,\n                        254, 239, 202, 168, 26, 207, 213, 116, 215, 
141, 77, 7,\n                        245, 174, 144, 39, 28,\n                    ]),\n                    122,\n                ),\n                Del(Key(vec![\n                    13, 152, 171, 90, 130, 131, 232, 51, 173, 103, 255, 225,\n                    156, 192, 146, 141, 94, 84, 39, 171, 152, 114, 133, 20,\n                    125, 68, 57, 27, 33, 175, 37, 164, 40,\n                ])),\n                Scan(Key(vec![]), -34),\n                Set(Key(vec![]), 85),\n                Merge(Key(vec![112]), 104),\n                Restart,\n                Restart,\n                Del(Key(vec![237])),\n                Set(\n                    Key(vec![\n                        53, 79, 71, 234, 187, 78, 206, 117, 48, 84, 162, 101,\n                        132, 137, 43, 144, 234, 23, 116, 13, 28, 184, 174, 241,\n                        181, 201, 131, 156, 7, 103, 135, 17, 168, 249, 7, 120,\n                        74, 8, 192, 134, 109, 54, 175, 130, 145, 206, 185, 49,\n                        144, 133, 226, 244, 42, 126, 176, 232, 96, 56, 70, 56,\n                        159, 127, 35, 39, 185, 114, 182, 41, 50, 93, 61,\n                    ]),\n                    144,\n                ),\n                Merge(\n                    Key(vec![\n                        10, 58, 6, 62, 17, 15, 26, 29, 79, 34, 77, 12, 93, 65,\n                        87, 71, 19, 57, 25, 40, 53, 73, 57, 2, 81, 49, 67, 62,\n                        78, 14, 34, 70, 86, 49, 86, 84, 16, 33, 24, 7, 87, 49,\n                        58, 50, 13, 14, 35, 46, 7, 39, 76, 51, 21, 76, 9, 53,\n                        45, 21, 71, 48, 16, 73, 68, 1, 63, 34, 12, 42, 11, 85,\n                        79, 19, 11, 77, 90, 0, 62, 56, 37, 33, 10, 69, 20, 64,\n                        15, 51, 64, 90, 69, 15, 7, 41, 53, 71, 52, 21, 45, 45,\n                        49, 3, 59, 15, 90, 7, 12, 62, 30, 81,\n                    ]),\n                    131,\n                ),\n                Get(Key(vec![\n        
            79, 28, 48, 41, 5, 70, 54, 56, 36, 32, 59, 15, 26, 42, 61,\n                    23, 53, 6, 71, 44, 61, 65, 4, 17, 23, 15, 65, 64, 46, 66,\n                    27, 63, 51, 44, 35, 1, 8, 70, 7, 1, 13, 10, 40, 6, 36, 64,\n                    68, 52, 8, 0, 46, 53, 48, 32, 9, 52, 69, 41, 8, 57, 27, 31,\n                    79, 27, 12, 70, 72, 33, 6, 22, 47, 37, 11, 38, 32, 7, 31,\n                    37, 45, 23, 74, 22, 46, 1, 3, 74, 72, 56, 52, 65, 78, 28,\n                    5, 68, 30, 36, 5, 43, 7, 2, 48, 75, 16, 53, 31, 40, 9, 3,\n                    49, 71, 70, 20, 24, 6, 23, 76, 49, 21, 12, 60, 54, 43, 7,\n                    79, 74, 62, 53, 20, 46, 11, 74, 29, 31, 43, 20, 27, 22, 22,\n                    15, 59, 12, 21, 61, 11, 8, 28, 5, 78, 70, 22, 11, 36, 62,\n                    56, 44, 49, 25, 39, 37, 24, 72, 65, 67, 22, 48, 16, 50, 5,\n                    10, 13, 36, 65, 29, 3, 26, 74, 15, 73, 78, 36, 14, 36, 30,\n                    42, 19, 73, 65, 75, 2, 25, 1, 32, 38, 43, 58, 19, 37, 37,\n                    48, 23, 72, 77, 34, 24, 1, 4, 42, 11, 68, 54, 23, 34, 0,\n                    48, 20, 20, 23, 61, 65, 72, 64, 24, 63, 3, 21, 48, 63, 57,\n                    40, 36, 46, 48, 8, 20, 62, 7, 69, 35, 79, 38, 45, 74, 7,\n                    16, 48, 59, 56, 31, 13, 13,\n                ])),\n                Del(Key(vec![176, 58, 119])),\n                Get(Key(vec![241])),\n                Get(Key(vec![160])),\n                Cas(Key(vec![]), 166, 235),\n                Set(\n                    Key(vec![\n                        64, 83, 151, 149, 100, 93, 5, 18, 91, 58, 84, 156, 127,\n                        108, 99, 168, 54, 51, 169, 185, 174, 101, 178, 148, 28,\n                        91, 25, 138, 14, 133, 170, 97, 138, 180, 157, 131, 174,\n                        22, 91, 108, 59, 165, 52, 28, 17, 175, 44, 95, 112, 38,\n                        141, 46, 124, 49, 116, 55, 39, 109, 73, 181, 104, 86,\n                        81, 150, 95, 149, 
69, 110, 110, 102, 22, 62, 180, 60,\n                        87, 127, 127, 136, 12, 139, 109, 165, 34, 181, 158,\n                        156, 102, 38, 6, 149, 183, 69, 129, 98, 161, 175, 82,\n                        51, 47, 93, 136, 16, 118, 65, 152, 139, 8, 30, 10, 100,\n                        47, 13, 47, 179, 87, 19, 109, 78, 116, 20, 111, 89, 28,\n                        0, 86, 39, 139, 7, 111, 40, 145, 155, 107, 45, 36, 90,\n                        143, 154, 135, 36, 13, 98, 61, 150, 65, 128, 16, 52,\n                        100, 128, 11, 5, 49, 143, 56, 78, 48, 62, 86, 50, 86,\n                        41, 153, 53, 139, 89, 164, 33, 136, 83, 182, 53, 132,\n                        144, 177, 105, 104, 55, 9, 174, 30, 65, 76, 33, 163,\n                        172, 80, 169, 175, 54, 165, 173, 109, 24, 70, 25, 158,\n                        135, 76, 130, 76, 9, 56, 20, 13, 133, 33, 168, 160,\n                        153, 43, 80, 58, 56, 171, 28, 97, 122, 162, 32, 164,\n                        11, 112, 177, 63, 47, 25, 0, 66, 87, 169, 118, 173, 27,\n                        154, 79, 72, 107, 140, 126, 150, 60, 174, 184, 111,\n                        155, 22, 32, 185, 149, 95, 60, 146, 165, 103, 34, 131,\n                        91, 92, 85, 6, 102, 172, 131, 178, 141, 76, 84, 121,\n                        49, 19, 66, 127, 45, 23, 159, 33, 138, 47, 36, 106, 39,\n                        83, 164, 83, 16, 126, 126, 118, 84, 171,\n                    ]),\n                    143,\n                ),\n                Scan(Key(vec![165]), -26),\n                Get(Key(vec![])),\n                Del(Key(vec![])),\n                Set(\n                    Key(vec![\n                        197, 224, 20, 219, 111, 246, 70, 138, 190, 237, 9, 202,\n                        187, 160, 47, 10, 231, 14, 2, 131, 30, 202, 95, 48, 44,\n                        21, 192, 155, 172, 51, 101, 155, 73, 5, 22, 140, 137,\n                        11, 37, 79, 79, 92, 25, 107, 82, 145, 39, 45, 
155, 136,\n                        242, 8, 43, 71, 28, 70, 94, 79, 151, 20, 144, 53, 100,\n                        196, 74, 140, 27, 224, 59, 1, 143, 136, 132, 85, 114,\n                        166, 103, 242, 156, 183, 168, 148, 2, 33, 29, 201, 7,\n                        96, 13, 33, 102, 172, 21, 96, 27, 1, 86, 149, 150, 119,\n                        208, 118, 148, 51, 143, 54, 245, 89, 216, 145, 145, 72,\n                        105, 51, 19, 14, 15, 18, 34, 16, 101, 172, 133, 32,\n                        173, 106, 157, 15, 48, 194, 27, 55, 204, 110, 145, 99,\n                        9, 37, 195, 206, 13, 246, 161, 100, 222, 235, 184, 12,\n                        64, 103, 50, 158, 242, 163, 198, 61, 224, 130, 226,\n                        187, 158, 175, 135, 54, 110, 33, 9, 59, 127, 135, 47,\n                        204, 109, 105, 0, 161, 48, 247, 140, 101, 141, 81, 157,\n                        80, 135, 228, 102, 44, 74, 53, 121, 116, 17, 56, 26,\n                        112,\n                    ]),\n                    22,\n                ),\n                Set(Key(vec![110]), 222),\n                Set(Key(vec![94]), 5),\n                GetGt(Key(vec![\n                    181, 161, 96, 186, 128, 24, 232, 74, 149, 3, 129, 98, 220,\n                    25, 111, 111, 163, 244, 229, 137, 159, 137, 13, 12, 97,\n                    150, 6, 88, 76, 77, 31, 36, 57, 54, 82, 85, 119, 250, 187,\n                    163, 132, 73, 194, 129, 149, 176, 62, 118, 166, 50, 200,\n                    28, 158, 184, 28, 139, 74, 87, 144, 87, 1, 73, 37, 46, 226,\n                    91, 102, 13, 67, 195, 64, 189, 90, 190, 163, 216, 171, 22,\n                    69, 234, 57, 134, 96, 198, 179, 115, 43, 160, 104, 252,\n                    105, 192, 91, 211, 176, 171, 252, 236, 202, 158, 250, 186,\n                    134, 154, 82, 17, 113, 175, 13, 125, 185, 101, 38, 236,\n                    155, 30, 110, 11, 33, 198, 114, 184, 84, 91, 67, 125, 55,\n                    188, 124, 
242, 89, 124, 69, 18, 26, 137, 34, 33, 201, 58,\n                    252, 134, 33, 131, 126, 136, 168, 20, 32, 237, 10, 57, 158,\n                    149, 102, 62, 10, 98, 106, 10, 93, 78, 240, 205, 38, 186,\n                    97, 104, 204, 14, 34, 100, 179, 161, 135, 136, 194, 99,\n                ])),\n                Merge(Key(vec![95]), 253),\n                GetLt(Key(vec![99])),\n                Merge(Key(vec![]), 124),\n                Get(Key(vec![61])),\n                Restart,\n            ],\n            false,\n            0,\n            256,\n        );\n    }\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_40() {\n    // postmortem: deletions of non-existent keys were\n    // being persisted despite being unnecessary.\n    prop_tree_matches_btreemap(\n        vec![Del(Key(vec![99; 111222333]))],\n        false,\n        0,\n        256,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_41() {\n    // postmortem: indexing of values during\n    // iteration was incorrect.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![]), 131),\n            Set(Key(vec![17; 1]), 214),\n            Set(Key(vec![4; 1]), 202),\n            Set(Key(vec![24; 1]), 79),\n            Set(Key(vec![26; 1]), 235),\n            Scan(Key(vec![]), 19),\n        ],\n        false,\n        0,\n        256,\n    );\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_42() {\n    // postmortem: during refactoring, accidentally\n    // messed up the index selection for merge destinations.\n    for _ in 0..100 {\n        prop_tree_matches_btreemap(\n            vec![\n                Merge(Key(vec![]), 112),\n                Set(Key(vec![110; 1]), 153),\n                Set(Key(vec![15; 1]), 100),\n                Del(Key(vec![110; 1])),\n                GetLt(Key(vec![148; 1])),\n            ],\n            false,\n            0,\n            256,\n        );\n    }\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_43() 
{\n    // postmortem: when changing the PageState to always\n    // include a base node, we did not account for this\n    // in the tag + size compressed value. This was not\n    // caught by the quickcheck tests because PageState's\n    // Arbitrary implementation would ensure that at least\n    // one frag was present, which was the invariant before\n    // the base was extracted away from the vec of frags.\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![241; 1]), 199),\n            Set(Key(vec![]), 198),\n            Set(Key(vec![72; 108]), 175),\n            GetLt(Key(vec![])),\n            Restart,\n            Restart,\n        ],\n        false,\n        0,\n        288,\n    );\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_44() {\n    // postmortem: off-by-one bug related to LSN recovery\n    // where 1 was added to the index when the recovered\n    // LSN was actually divisible by the segment size\n    assert!(prop_tree_matches_btreemap(\n        vec![\n            Merge(Key(vec![]), 97),\n            Merge(Key(vec![]), 41),\n            Merge(Key(vec![]), 241),\n            Set(Key(vec![21; 1]), 24),\n            Del(Key(vec![])),\n            Set(Key(vec![]), 145),\n            Set(Key(vec![151; 1]), 187),\n            Get(Key(vec![])),\n            Restart,\n            Set(Key(vec![]), 151),\n            Restart,\n        ],\n        false,\n        0,\n        256\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_45() {\n    // postmortem: recovery was not properly accounting for\n    // the possibility of a segment to be maxed out, similar\n    // to bug 44.\n    for _ in 0..10 {\n        assert!(prop_tree_matches_btreemap(\n            vec![\n                Merge(Key(vec![206; 77]), 225),\n                Set(Key(vec![88; 190]), 40),\n                Set(Key(vec![162; 1]), 213),\n                Merge(Key(vec![186; 1]), 175),\n                Set(Key(vec![105; 16]), 111),\n                
Cas(Key(vec![]), 75, 252),\n                Restart\n            ],\n            false,\n            true,\n            0,\n            210\n        ))\n    }\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_46() {\n    // postmortem: while implementing the heap slab, decompression\n    // was failing to account for the fact that the slab allocator\n    // will always write to the end of the slab to be compatible\n    // with O_DIRECT.\n    for _ in 0..1 {\n        assert!(prop_tree_matches_btreemap(vec![Restart], false, 0, 256))\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_47() {\n    // postmortem:\n    assert!(prop_tree_matches_btreemap(\n        vec![Set(Key(vec![88; 1]), 40), Restart, Get(Key(vec![88; 1]))],\n        false,\n        0,\n        256\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_48() {\n    // postmortem: node value buffer calculations were failing to\n    // account for potential padding added to avoid buffer overreads\n    // while looking up offsets.\n    assert!(prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![23; 1]), 78),\n            Set(Key(vec![120; 1]), 223),\n            Set(Key(vec![123; 1]), 235),\n            Set(Key(vec![60; 1]), 234),\n            Set(Key(vec![]), 71),\n            Del(Key(vec![120; 1])),\n            Scan(Key(vec![]), -9)\n        ],\n        false,\n        0,\n        256\n    ))\n}\n\n/*\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_49() {\n    // postmortem: was incorrectly calculating the child offset while searching\n    // for a node with omitted keys, where the distance == the stride, and\n    // as a result we went into an infinite loop trying to apply a parent\n    // split that was already present\n    assert!(prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![39; 1]), 245),\n            Set(Key(vec![108; 1]), 96),\n            Set(Key(vec![147; 1]), 44),\n            Set(Key(vec![102; 1]), 2),\n            
Merge(Key(vec![22; 1]), 160),\n            Set(Key(vec![36; 1]), 1),\n            Set(Key(vec![65; 1]), 213),\n            Set(Key(vec![]), 221),\n            Set(Key(vec![84; 1]), 20),\n            Merge(Key(vec![229; 1]), 61),\n            Set(Key(vec![156; 1]), 69),\n            Merge(Key(vec![252; 1]), 85),\n            Set(Key(vec![36; 2]), 57),\n            Set(Key(vec![245; 1]), 143),\n            Set(Key(vec![59; 1]), 209),\n            GetGt(Key(vec![136; 1])),\n            Set(Key(vec![40; 1]), 96),\n            GetGt(Key(vec![59; 2]))\n        ],\n        false,\n        false,\n        0,\n        0\n    ))\n}\n*/\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_50() {\n    // postmortem: node value buffer calculations were failing to\n    // account for potential padding added to avoid buffer overreads\n    // while looking up offsets.\n    assert!(prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![1; 1]), 44),\n            Set(Key(vec![52; 1]), 108),\n            Set(Key(vec![80; 1]), 177),\n            Set(Key(vec![225; 1]), 59),\n            Set(Key(vec![246; 1]), 34),\n            Set(Key(vec![51; 1]), 233),\n            Set(Key(vec![]), 88),\n            GetLt(Key(vec![1; 1]))\n        ],\n        false,\n        0,\n        0\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_51() {\n    // postmortem:\n    prop_tree_matches_btreemap(\n        vec![Set(Key(vec![]), 135), Restart, Scan(Key(vec![]), -38)],\n        false,\n        0,\n        0,\n    );\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_bug_52() {\n    // postmortem:\n    prop_tree_matches_btreemap(\n        vec![\n            Set(Key(vec![57; 1]), 235),\n            Set(Key(vec![229; 1]), 136),\n            Set(Key(vec![]), 74),\n            Set(Key(vec![57; 2]), 0),\n            Get(Key(vec![57; 1])),\n            GetGt(Key(vec![57; 1])),\n            Get(Key(vec![57; 2])),\n            GetLt(Key(vec![57; 2])),\n            Scan(Key(vec![]), 4),\n    
    ],\n        false,\n        0,\n        0,\n    );\n}\n"
  },
  {
    "path": "tests/common/mod.rs",
    "content": "// the testing-shred-allocator feature causes all allocated and deallocated\n// memory to be set to a specific non-zero value of 0xa1 for\n// uninitialized allocations and 0xde for deallocated memory,\n// in the hope that it will cause memory errors to surface\n// more quickly.\n#[cfg(feature = \"testing-shred-allocator\")]\nmod alloc {\n    use std::alloc::{Layout, System};\n\n    #[global_allocator]\n    static ALLOCATOR: Alloc = Alloc;\n\n    #[derive(Default, Debug, Clone, Copy)]\n    struct Alloc;\n\n    unsafe impl std::alloc::GlobalAlloc for Alloc {\n        unsafe fn alloc(&self, layout: Layout) -> *mut u8 {\n            let ret = System.alloc(layout);\n            assert_ne!(ret, std::ptr::null_mut());\n            std::ptr::write_bytes(ret, 0xa1, layout.size());\n            ret\n        }\n\n        unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {\n            std::ptr::write_bytes(ptr, 0xde, layout.size());\n            System.dealloc(ptr, layout)\n        }\n    }\n}\n\npub fn setup_logger() {\n    use std::io::Write;\n\n    fn tn() -> String {\n        std::thread::current().name().unwrap_or(\"unknown\").to_owned()\n    }\n\n    let mut builder = env_logger::Builder::new();\n    builder\n        .format(|buf, record| {\n            writeln!(\n                buf,\n                \"{:05} {:20} {:10} {}\",\n                record.level(),\n                tn(),\n                record.module_path().unwrap().split(\"::\").last().unwrap(),\n                record.args()\n            )\n        })\n        .filter(None, log::LevelFilter::Info);\n\n    if let Ok(env) = std::env::var(\"RUST_LOG\") {\n        builder.parse_filters(&env);\n    }\n\n    let _r = builder.try_init();\n}\n\n#[allow(dead_code)]\npub fn cleanup(dir: &str) {\n    let dir = std::path::Path::new(dir);\n    if dir.exists() {\n        std::fs::remove_dir_all(dir).unwrap();\n    }\n}\n"
  },
  {
    "path": "tests/concurrent_batch_atomicity.rs",
    "content": "use std::sync::{Arc, Barrier};\nuse std::thread;\n\nuse sled::{Config, Db as SledDb};\n\nconst CONCURRENCY: usize = 32;\nconst N_KEYS: usize = 1024;\n\ntype Db = SledDb<8>;\n\nfn batch_writer(db: Db, barrier: Arc<Barrier>, thread_number: usize) {\n    barrier.wait();\n    let mut batch = sled::Batch::default();\n    for key_number in 0_u128..N_KEYS as _ {\n        // LE is intentionally a little scrambled\n        batch.insert(&key_number.to_le_bytes(), &thread_number.to_le_bytes());\n    }\n\n    db.apply_batch(batch).unwrap();\n}\n\n#[test]\nfn concurrent_batch_atomicity() {\n    let db: Db = Config {\n        path: \"concurrent_batch_atomicity\".into(),\n        ..Default::default()\n    }\n    .open()\n    .unwrap();\n\n    let mut threads = vec![];\n\n    let flusher_barrier = Arc::new(Barrier::new(CONCURRENCY));\n    for tn in 0..CONCURRENCY {\n        let db = db.clone();\n        let barrier = flusher_barrier.clone();\n        let thread = thread::Builder::new()\n            .name(format!(\"t(thread: {} flusher)\", tn))\n            .spawn(move || {\n                db.flush().unwrap();\n                barrier.wait();\n            })\n            .expect(\"should be able to spawn thread\");\n        threads.push(thread);\n    }\n\n    let barrier = Arc::new(Barrier::new(CONCURRENCY + 1));\n    for thread_number in 0..CONCURRENCY {\n        let db = db.clone();\n        let barrier = barrier.clone();\n        let jh =\n            thread::spawn(move || batch_writer(db, barrier, thread_number));\n        threads.push(jh);\n    }\n\n    barrier.wait();\n    let before = std::time::Instant::now();\n\n    for thread in threads.into_iter() {\n        thread.join().unwrap();\n    }\n\n    println!(\"writers took {:?}\", before.elapsed());\n\n    let mut expected_v = None;\n\n    for key_number in 0_u128..N_KEYS as _ {\n        let actual_v = db.get(&key_number.to_le_bytes()).unwrap().unwrap();\n        if expected_v.is_none() {\n            
expected_v = Some(actual_v.clone());\n        }\n        assert_eq!(Some(actual_v), expected_v);\n    }\n\n    let _ = std::fs::remove_dir_all(\"concurrent_batch_atomicity\");\n}\n"
  },
  {
    "path": "tests/crash_tests/crash_batches.rs",
    "content": "use std::thread;\n\nuse rand::Rng;\n\nuse super::*;\n\nconst CACHE_SIZE: usize = 1024 * 1024;\nconst BATCH_SIZE: u32 = 8;\nconst SEGMENT_SIZE: usize = 1024;\n\n/// Verifies that the keys in the tree are correctly recovered (i.e., equal).\n/// Panics if they are incorrect.\nfn verify_batches(tree: &Db) -> u32 {\n    let mut iter = tree.iter();\n    let first_value = match iter.next() {\n        Some(Ok((_k, v))) => slice_to_u32(&*v),\n        Some(Err(e)) => panic!(\"{:?}\", e),\n        None => return 0,\n    };\n\n    // we now expect all items in the batch to be present and to have the same value\n\n    for key in 0..BATCH_SIZE {\n        let res = tree.get(u32_to_vec(key));\n        let option = res.unwrap();\n        let v = match option {\n            Some(v) => v,\n            None => panic!(\n                \"expected key {} to have a value, instead it was missing in db with keys: {}\",\n                key,\n                tree_to_string(&tree)\n            ),\n        };\n        let value = slice_to_u32(&*v);\n        // FIXME BUG 1 count 2\n        // assertion `left == right` failed: expected key 0 to have value 62003, instead it had value 62375 in db with keys:\n        // {0:62003, 1:62003, 2:62003, 3:62003, 4:62003, 5:62003, 6:62003, 7:62003,\n        // Human: iterating shows correct value, but first get did not\n        //\n        // expected key 1 to have value 1, instead it had value 29469 in db with keys:\n        // {0:1, 1:29469, 2:29469, 3:29469, 4:29469, 5:29469, 6:29469, 7:29469,\n        // Human: 0 didn't get included in later syncs\n        //\n        //  expected key 0 to have value 59485, instead it had value 59484 in db with keys:\n        //  {0:59485, 1:59485, 2:59485, 3:59485, 4:59485, 5:59485, 6:59485, 7:59485,\n        //  Human: had key N during first check, then N + 1 in iteration\n        assert_eq!(\n            first_value,\n            value,\n            \"expected key {} to have value {}, instead it 
had value {}. second get: {:?}. db iter: {}. third get: {:?}\",\n            key,\n            first_value,\n            value,\n            slice_to_u32(&*tree.get(u32_to_vec(key)).unwrap().unwrap()),\n            tree_to_string(&tree),\n            slice_to_u32(&*tree.get(u32_to_vec(key)).unwrap().unwrap()),\n        );\n    }\n\n    first_value\n}\n\nfn run_batches_inner(db: Db) {\n    fn do_batch(i: u32, db: &Db) {\n        let mut rng = rand::rng();\n        let base_value = u32_to_vec(i);\n\n        let mut batch = sled::Batch::default();\n        if rng.random_bool(0.1) {\n            for key in 0..BATCH_SIZE {\n                batch.remove(u32_to_vec(key));\n            }\n        } else {\n            for key in 0..BATCH_SIZE {\n                let mut value = base_value.clone();\n                let additional_len = rng.random_range(0..SEGMENT_SIZE / 3);\n                value.append(&mut vec![0u8; additional_len]);\n\n                batch.insert(u32_to_vec(key), value);\n            }\n        }\n        db.apply_batch(batch).unwrap();\n    }\n\n    let mut i = verify_batches(&db);\n    i += 1;\n    do_batch(i, &db);\n\n    loop {\n        i += 1;\n        do_batch(i, &db);\n    }\n}\n\npub fn run_crash_batches() {\n    let crash_during_initialization = rand::rng().random_ratio(1, 10);\n\n    if crash_during_initialization {\n        spawn_killah();\n    }\n\n    let path = std::path::Path::new(CRASH_DIR).join(BATCHES_DIR);\n    let config = Config::new()\n        .cache_capacity_bytes(CACHE_SIZE)\n        .flush_every_ms(Some(1))\n        .path(path);\n\n    let db = config.open().expect(\"couldn't open batch db\");\n    let db2 = db.clone();\n\n    let t1 = thread::spawn(|| run_batches_inner(db));\n    let t2 = thread::spawn(move || {\n        loop {\n            db2.flush().unwrap();\n        }\n    }); // run_batches_inner(db2));\n\n    if !crash_during_initialization {\n        spawn_killah();\n    }\n\n    let Err(e) = t1.join().and_then(|_| 
t2.join());\n\n    println!(\"worker thread failed: {:?}\", e);\n    std::process::exit(15);\n}\n"
  },
  {
    "path": "tests/crash_tests/crash_heap.rs",
    "content": "use super::*;\n\nconst FANOUT: usize = 3;\n\npub fn run_crash_heap() {\n    let path = std::path::Path::new(CRASH_DIR).join(HEAP_DIR);\n    let config = Config::new().path(path);\n\n    let HeapRecovery { heap, recovered_nodes, was_recovered } =\n        Heap::recover(FANOUT, &config).unwrap();\n\n    // validate\n\n    spawn_killah();\n\n    loop {}\n}\n"
  },
  {
    "path": "tests/crash_tests/crash_iter.rs",
    "content": "use std::sync::{Arc, Barrier};\nuse std::thread;\n\nuse super::*;\n\nconst CACHE_SIZE: usize = 256;\n\npub fn run_crash_iter() {\n    const N_FORWARD: usize = 50;\n    const N_REVERSE: usize = 50;\n\n    let path = std::path::Path::new(CRASH_DIR).join(ITER_DIR);\n    let config = Config::new()\n        .cache_capacity_bytes(CACHE_SIZE)\n        .path(path)\n        .flush_every_ms(Some(1));\n\n    let db: Db = config.open().expect(\"couldn't open iter db\");\n    let t = db.open_tree(b\"crash_iter_test\").unwrap();\n\n    thread::Builder::new()\n        .name(\"crash_iter_flusher\".to_string())\n        .spawn({\n            let t = t.clone();\n            move || loop {\n                t.flush().unwrap();\n            }\n        })\n        .unwrap();\n\n    const INDELIBLE: [&[u8]; 16] = [\n        &[0u8],\n        &[1u8],\n        &[2u8],\n        &[3u8],\n        &[4u8],\n        &[5u8],\n        &[6u8],\n        &[7u8],\n        &[8u8],\n        &[9u8],\n        &[10u8],\n        &[11u8],\n        &[12u8],\n        &[13u8],\n        &[14u8],\n        &[15u8],\n    ];\n\n    for item in &INDELIBLE {\n        t.insert(*item, *item).unwrap();\n    }\n    t.flush().unwrap();\n\n    let barrier = Arc::new(Barrier::new(N_FORWARD + N_REVERSE + 2));\n    let mut threads = vec![];\n\n    for i in 0..N_FORWARD {\n        let t = thread::Builder::new()\n            .name(format!(\"forward({})\", i))\n            .spawn({\n                let t = t.clone();\n                let barrier = barrier.clone();\n                move || {\n                    barrier.wait();\n                    loop {\n                        let expected = INDELIBLE.iter();\n                        let mut keys = t.iter().keys();\n\n                        for expect in expected {\n                            loop {\n                                let k = keys.next().unwrap().unwrap();\n                                assert!(\n                                    &*k <= 
*expect,\n                                    \"witnessed key is {:?} but we expected \\\n                                     one <= {:?}, so we overshot due to a \\\n                                     concurrent modification\",\n                                    k,\n                                    expect,\n                                );\n                                if &*k == *expect {\n                                    break;\n                                }\n                            }\n                        }\n                    }\n                }\n            })\n            .unwrap();\n        threads.push(t);\n    }\n\n    for i in 0..N_REVERSE {\n        let t = thread::Builder::new()\n            .name(format!(\"reverse({})\", i))\n            .spawn({\n                let t = t.clone();\n                let barrier = barrier.clone();\n                move || {\n                    barrier.wait();\n                    loop {\n                        let expected = INDELIBLE.iter().rev();\n                        let mut keys = t.iter().keys().rev();\n\n                        for expect in expected {\n                            loop {\n                                if let Some(Ok(k)) = keys.next() {\n                                    assert!(\n                                        &*k >= *expect,\n                                        \"witnessed key is {:?} but we expected \\\n                                         one >= {:?}, so we overshot due to a \\\n                                         concurrent modification\\n{:?}\",\n                                        k,\n                                        expect,\n                                        t,\n                                    );\n                                    if &*k == *expect {\n                                        break;\n                                    }\n                                } else {\n                                 
   panic!(\"undershot key on tree: \\n{:?}\", t);\n                                }\n                            }\n                        }\n                    }\n                }\n            })\n            .unwrap();\n\n        threads.push(t);\n    }\n\n    let inserter = thread::Builder::new()\n        .name(\"inserter\".into())\n        .spawn({\n            let t = t.clone();\n            let barrier = barrier.clone();\n            move || {\n                barrier.wait();\n\n                loop {\n                    for i in 0..(16 * 16 * 8) {\n                        let major = i / (16 * 8);\n                        let minor = i % 16;\n\n                        let mut base = INDELIBLE[major].to_vec();\n                        base.push(minor as u8);\n                        t.insert(base.clone(), base.clone()).unwrap();\n                    }\n                }\n            }\n        })\n        .unwrap();\n\n    threads.push(inserter);\n\n    let deleter = thread::Builder::new()\n        .name(\"deleter\".into())\n        .spawn({\n            move || {\n                barrier.wait();\n\n                loop {\n                    for i in 0..(16 * 16 * 8) {\n                        let major = i / (16 * 8);\n                        let minor = i % 16;\n\n                        let mut base = INDELIBLE[major].to_vec();\n                        base.push(minor as u8);\n                        t.remove(&base).unwrap();\n                    }\n                }\n            }\n        })\n        .unwrap();\n\n    spawn_killah();\n\n    threads.push(deleter);\n\n    for thread in threads.into_iter() {\n        thread.join().expect(\"thread should not have crashed\");\n    }\n}\n"
  },
  {
    "path": "tests/crash_tests/crash_metadata_store.rs",
    "content": "use super::*;\n\npub fn run_crash_metadata_store() {\n    let (metadata_store, recovered) =\n        MetadataStore::recover(&HEAP_DIR).unwrap();\n\n    // validate\n\n    spawn_killah();\n\n    loop {}\n}\n"
  },
  {
    "path": "tests/crash_tests/crash_object_cache.rs",
    "content": "use super::*;\n\nconst FANOUT: usize = 3;\n\npub fn run_crash_object_cache() {\n    let path = std::path::Path::new(CRASH_DIR).join(OBJECT_CACHE_DIR);\n    let config = Config::new().flush_every_ms(Some(1)).path(path);\n\n    let (oc, collections, was_recovered): (ObjectCache<FANOUT>, _, bool) =\n        ObjectCache::recover(&config).unwrap();\n\n    // validate\n\n    spawn_killah();\n\n    loop {}\n}\n"
  },
  {
    "path": "tests/crash_tests/crash_sequential_writes.rs",
    "content": "use std::thread;\n\nuse super::*;\n\nconst CACHE_SIZE: usize = 1024 * 1024;\nconst CYCLE: usize = 256;\nconst SEGMENT_SIZE: usize = 1024;\n\n/// Verifies that the keys in the tree are correctly recovered.\n/// Panics if they are incorrect.\n/// Returns the key that should be resumed at, and the current cycle value.\nfn verify(tree: &Db) -> (u32, u32) {\n    // key 0 should always be the highest value, as that's where we increment\n    // at some point, it might go down by one\n    // it should never return, or go down again after that\n    let mut iter = tree.iter();\n    let highest = match iter.next() {\n        Some(Ok((_k, v))) => slice_to_u32(&*v),\n        Some(Err(e)) => panic!(\"{:?}\", e),\n        None => return (0, 0),\n    };\n\n    let highest_vec = u32_to_vec(highest);\n\n    // find how far we got\n    let mut contiguous: u32 = 0;\n    let mut lowest_with_high_value = 0;\n\n    for res in iter {\n        let (k, v) = res.unwrap();\n        if v[..4] == highest_vec[..4] {\n            contiguous += 1;\n        } else {\n            let expected = if highest == 0 {\n                CYCLE as u32 - 1\n            } else {\n                (highest - 1) % CYCLE as u32\n            };\n            let actual = slice_to_u32(&*v);\n            // FIXME BUG 2\n            // thread '<unnamed>' panicked at tests/test_crash_recovery.rs:159:13:\n            // assertion `left == right` failed\n            //   left: 139\n            //  right: 136\n            assert_eq!(\n                expected,\n                actual,\n                \"tree failed assertion with iterated values: {}, k: {:?} v: {:?} expected: {} highest: {}\",\n                tree_to_string(&tree),\n                k,\n                v,\n                expected,\n                highest\n            );\n            lowest_with_high_value = actual;\n            break;\n        }\n    }\n\n    // ensure nothing changes after this point\n    let low_beginning = 
u32_to_vec(contiguous + 1);\n\n    for res in tree.range(&*low_beginning..) {\n        let (k, v): (sled::InlineArray, _) = res.unwrap();\n        assert_eq!(\n            slice_to_u32(&*v),\n            lowest_with_high_value,\n            \"expected key {} to have value {}, instead it had value {} in db: {:?}\",\n            slice_to_u32(&*k),\n            lowest_with_high_value,\n            slice_to_u32(&*v),\n            tree\n        );\n    }\n\n    (contiguous, highest)\n}\n\nfn run_inner(config: Config) {\n    let crash_during_initialization = rand::rng().random_bool(0.1);\n\n    if crash_during_initialization {\n        spawn_killah();\n    }\n\n    let tree = config.open().expect(\"couldn't open db\");\n\n    if !crash_during_initialization {\n        spawn_killah();\n    }\n\n    let (key, highest) = verify(&tree);\n\n    let mut hu = ((highest as usize) * CYCLE) + key as usize;\n    assert_eq!(hu % CYCLE, key as usize);\n    assert_eq!(hu / CYCLE, highest as usize);\n\n    loop {\n        let key = u32_to_vec((hu % CYCLE) as u32);\n\n        //dbg!(hu, hu % CYCLE);\n\n        let mut value = u32_to_vec((hu / CYCLE) as u32);\n        let additional_len = rand::rng().random_range(0..SEGMENT_SIZE / 3);\n        value.append(&mut vec![0u8; additional_len]);\n\n        tree.insert(&key, value).unwrap();\n\n        hu += 1;\n\n        if hu / CYCLE >= CYCLE {\n            hu = 0;\n        }\n    }\n}\n\npub fn run_crash_sequential_writes() {\n    let path = std::path::Path::new(CRASH_DIR).join(SEQUENTIAL_WRITES_DIR);\n    let config = Config::new()\n        .cache_capacity_bytes(CACHE_SIZE)\n        .flush_every_ms(Some(1))\n        .path(path);\n\n    if let Err(e) = thread::spawn(|| run_inner(config)).join() {\n        println!(\"worker thread failed: {:?}\", e);\n        std::process::exit(15);\n    }\n}\n"
  },
  {
    "path": "tests/crash_tests/crash_tx.rs",
    "content": "use super::*;\n\nconst CACHE_SIZE: usize = 1024 * 1024;\n\npub fn run_crash_tx() {\n    let config = Config::new()\n        .cache_capacity_bytes(CACHE_SIZE)\n        .flush_every_ms(Some(1))\n        .path(TX_DIR);\n\n    let _db: Db = config.open().unwrap();\n\n    spawn_killah();\n\n    loop {}\n\n    /*\n        db.insert(b\"k1\", b\"cats\").unwrap();\n        db.insert(b\"k2\", b\"dogs\").unwrap();\n        db.insert(b\"id\", &0_u64.to_le_bytes()).unwrap();\n\n        let mut threads = vec![];\n\n        const N_WRITERS: usize = 50;\n        const N_READERS: usize = 5;\n\n        let barrier = Arc::new(Barrier::new(N_WRITERS + N_READERS));\n\n        for _ in 0..N_WRITERS {\n            let db = db.clone();\n            let barrier = barrier.clone();\n            let thread = std::thread::spawn(move || {\n                barrier.wait();\n                loop {\n                    db.transaction::<_, _, ()>(|db| {\n                        let v1 = db.remove(b\"k1\").unwrap().unwrap();\n                        let v2 = db.remove(b\"k2\").unwrap().unwrap();\n\n                        db.insert(b\"id\", &db.generate_id().unwrap().to_le_bytes())\n                            .unwrap();\n\n                        db.insert(b\"k1\", v2).unwrap();\n                        db.insert(b\"k2\", v1).unwrap();\n                        Ok(())\n                    })\n                    .unwrap();\n                }\n            });\n            threads.push(thread);\n        }\n\n        for _ in 0..N_READERS {\n            let db = db.clone();\n            let barrier = barrier.clone();\n            let thread = std::thread::spawn(move || {\n                barrier.wait();\n                let mut last_id = 0;\n                loop {\n                    let read_id = db\n                        .transaction::<_, _, ()>(|db| {\n                            let v1 = db.get(b\"k1\").unwrap().unwrap();\n                            let v2 = 
db.get(b\"k2\").unwrap().unwrap();\n                            let id = u64::from_le_bytes(\n                                TryFrom::try_from(\n                                    &*db.get(b\"id\").unwrap().unwrap(),\n                                )\n                                .unwrap(),\n                            );\n\n                            let mut results = vec![v1, v2];\n                            results.sort();\n\n                            assert_eq!(\n                                [&results[0], &results[1]],\n                                [b\"cats\", b\"dogs\"]\n                            );\n\n                            Ok(id)\n                        })\n                        .unwrap();\n                    assert!(read_id >= last_id);\n                    last_id = read_id;\n                }\n            });\n            threads.push(thread);\n        }\n\n        spawn_killah();\n\n        for thread in threads.into_iter() {\n            thread.join().expect(\"threads should not crash\");\n        }\n\n        let v1 = db.get(b\"k1\").unwrap().unwrap();\n        let v2 = db.get(b\"k2\").unwrap().unwrap();\n        assert_eq!([v1, v2], [b\"cats\", b\"dogs\"]);\n    */\n}\n"
  },
  {
    "path": "tests/crash_tests/mod.rs",
    "content": "use std::mem::size_of;\nuse std::process::exit;\nuse std::thread;\nuse std::time::Duration;\n\nuse rand::Rng;\n\nuse sled::{\n    Config, Db as SledDb, Heap, HeapRecovery, MetadataStore, ObjectCache,\n};\n\nmod crash_batches;\nmod crash_heap;\nmod crash_iter;\nmod crash_metadata_store;\nmod crash_object_cache;\nmod crash_sequential_writes;\nmod crash_tx;\n\npub use crash_batches::run_crash_batches;\npub use crash_heap::run_crash_heap;\npub use crash_iter::run_crash_iter;\npub use crash_metadata_store::run_crash_metadata_store;\npub use crash_object_cache::run_crash_object_cache;\npub use crash_sequential_writes::run_crash_sequential_writes;\npub use crash_tx::run_crash_tx;\n\ntype Db = SledDb<8>;\n\n// test names, also used as dir names\npub const SEQUENTIAL_WRITES_DIR: &str = \"crash_sequential_writes\";\npub const BATCHES_DIR: &str = \"crash_batches\";\npub const ITER_DIR: &str = \"crash_iter\";\npub const TX_DIR: &str = \"crash_tx\";\npub const METADATA_STORE_DIR: &str = \"crash_metadata_store\";\npub const HEAP_DIR: &str = \"crash_heap\";\npub const OBJECT_CACHE_DIR: &str = \"crash_object_cache\";\n\nconst CRASH_DIR: &str = \"crash_test_files\";\n\nfn spawn_killah() {\n    thread::spawn(|| {\n        let runtime = rand::rng().random_range(0..60_000);\n        thread::sleep(Duration::from_micros(runtime));\n        exit(9);\n    });\n}\n\nfn u32_to_vec(u: u32) -> Vec<u8> {\n    let buf: [u8; size_of::<u32>()] = u.to_be_bytes();\n    buf.to_vec()\n}\n\nfn slice_to_u32(b: &[u8]) -> u32 {\n    let mut buf = [0u8; size_of::<u32>()];\n    buf.copy_from_slice(&b[..size_of::<u32>()]);\n\n    u32::from_be_bytes(buf)\n}\n\nfn tree_to_string(tree: &Db) -> String {\n    let mut ret = String::from(\"{\");\n    for kv_res in tree.iter() {\n        let (k, v) = kv_res.unwrap();\n        let k_s = slice_to_u32(&k);\n        let v_s = slice_to_u32(&v);\n        ret.push_str(&format!(\"{}:{}, \", k_s, v_s));\n    }\n    ret.push_str(\"}\");\n    ret\n}\n"
  },
  {
    "path": "tests/test_crash_recovery.rs",
    "content": "mod common;\nmod crash_tests;\n\nuse std::alloc::{Layout, System};\nuse std::env::{self, VarError};\nuse std::process::Command;\nuse std::thread;\n\nuse common::cleanup;\n\nconst TEST_ENV_VAR: &str = \"SLED_CRASH_TEST\";\nconst N_TESTS: usize = 100;\n\nconst TESTS: [&str; 7] = [\n    crash_tests::SEQUENTIAL_WRITES_DIR,\n    crash_tests::BATCHES_DIR,\n    crash_tests::ITER_DIR,\n    crash_tests::TX_DIR,\n    crash_tests::METADATA_STORE_DIR,\n    crash_tests::HEAP_DIR,\n    crash_tests::OBJECT_CACHE_DIR,\n];\n\nconst CRASH_CHANCE: u32 = 250;\n\n#[global_allocator]\nstatic ALLOCATOR: ShredAllocator = ShredAllocator;\n\n#[derive(Default, Debug, Clone, Copy)]\nstruct ShredAllocator;\n\nunsafe impl std::alloc::GlobalAlloc for ShredAllocator {\n    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {\n        assert!(layout.size() < 1_000_000_000);\n        let ret = unsafe { System.alloc(layout) };\n        assert_ne!(ret, std::ptr::null_mut());\n        unsafe {\n            std::ptr::write_bytes(ret, 0xa1, layout.size());\n        }\n        ret\n    }\n\n    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {\n        unsafe {\n            std::ptr::write_bytes(ptr, 0xde, layout.size());\n        }\n        unsafe { System.dealloc(ptr, layout) }\n    }\n}\n\nfn main() {\n    // Don't actually run this harness=false test under miri, as it requires\n    // spawning and killing child processes.\n    if cfg!(miri) {\n        return;\n    }\n\n    common::setup_logger();\n\n    match env::var(TEST_ENV_VAR) {\n        Err(VarError::NotPresent) => {\n            let filtered: Vec<&'static str> =\n                if let Some(filter) = std::env::args().nth(1) {\n                    TESTS\n                        .iter()\n                        .filter(|name| name.contains(&filter))\n                        .cloned()\n                        .collect()\n                } else {\n                    TESTS.to_vec()\n                };\n\n            let 
filtered_len = filtered.len();\n\n            println!();\n            println!(\n                \"running {} test{}\",\n                filtered.len(),\n                if filtered.len() == 1 { \"\" } else { \"s\" },\n            );\n\n            let mut tests = vec![];\n            for test_name in filtered.into_iter() {\n                let test = thread::spawn(move || {\n                    let res =\n                        std::panic::catch_unwind(|| supervisor(test_name));\n                    println!(\n                        \"test {} ... {}\",\n                        test_name,\n                        if res.is_ok() { \"ok\" } else { \"panicked\" }\n                    );\n                    res.unwrap();\n                });\n                tests.push((test_name, test));\n            }\n\n            for (test_name, test) in tests.into_iter() {\n                test.join().expect(test_name);\n            }\n\n            println!();\n            println!(\n                \"test result: ok. 
{} passed; {} filtered out\",\n                filtered_len,\n                TESTS.len() - filtered_len,\n            );\n            println!();\n        }\n\n        Ok(ref s) if s == crash_tests::SEQUENTIAL_WRITES_DIR => {\n            crash_tests::run_crash_sequential_writes()\n        }\n        Ok(ref s) if s == crash_tests::BATCHES_DIR => {\n            crash_tests::run_crash_batches()\n        }\n        Ok(ref s) if s == crash_tests::ITER_DIR => {\n            crash_tests::run_crash_iter()\n        }\n        Ok(ref s) if s == crash_tests::TX_DIR => crash_tests::run_crash_tx(),\n        Ok(ref s) if s == crash_tests::METADATA_STORE_DIR => {\n            crash_tests::run_crash_metadata_store()\n        }\n        Ok(ref s) if s == crash_tests::HEAP_DIR => {\n            crash_tests::run_crash_heap()\n        }\n        Ok(ref s) if s == crash_tests::OBJECT_CACHE_DIR => {\n            crash_tests::run_crash_object_cache()\n        }\n        Ok(other) => panic!(\"invalid crash test case: {other}\"),\n        Err(e) => panic!(\"env var {TEST_ENV_VAR} unable to be read: {e:?}\"),\n    }\n}\n\nfn run_child_process(dir: &str) {\n    let bin = env::current_exe().expect(\"could not get test binary path\");\n\n    unsafe {\n        env::set_var(TEST_ENV_VAR, dir);\n    }\n\n    let status_res = Command::new(bin)\n        .env(TEST_ENV_VAR, dir)\n        .env(\"SLED_CRASH_CHANCE\", CRASH_CHANCE.to_string())\n        .spawn()\n        .unwrap_or_else(|_| {\n            panic!(\"could not spawn child process for {} test\", dir)\n        })\n        .wait();\n\n    match status_res {\n        Ok(status) => {\n            let code = status.code();\n\n            if code.is_none() || code.unwrap() != 9 {\n                cleanup(dir);\n                panic!(\"{} test child exited abnormally\", dir);\n            }\n        }\n        Err(e) => {\n            cleanup(dir);\n            panic!(\"error waiting for {} test child: {}\", dir, e);\n        }\n    }\n}\n\nfn 
supervisor(dir: &str) {\n    cleanup(dir);\n\n    for _ in 0..N_TESTS {\n        run_child_process(dir);\n    }\n\n    cleanup(dir);\n}\n"
  },
  {
    "path": "tests/test_quiescent.rs",
    "content": "#![cfg(all(target_os = \"linux\", not(miri)))]\n\nmod common;\n\nuse std::time::{Duration, Instant};\n\nuse common::cleanup;\n\n#[test]\nfn quiescent_cpu_time() {\n    const DB_DIR: &str = \"sleeper\";\n    cleanup(DB_DIR);\n\n    fn run() {\n        let start = Instant::now();\n        let db = sled::open(DB_DIR).unwrap();\n        std::thread::sleep(Duration::from_secs(10));\n        drop(db);\n        let end = Instant::now();\n\n        let (user_cpu_time, system_cpu_time) = unsafe {\n            let mut resource_usage: libc::rusage = std::mem::zeroed();\n            let return_value = libc::getrusage(\n                libc::RUSAGE_SELF,\n                (&mut resource_usage) as *mut libc::rusage,\n            );\n            if return_value != 0 {\n                panic!(\"error {} from getrusage()\", *libc::__errno_location());\n            }\n            (resource_usage.ru_utime, resource_usage.ru_stime)\n        };\n\n        let user_cpu_seconds =\n            user_cpu_time.tv_sec as f64 + user_cpu_time.tv_usec as f64 * 1e-6;\n        let system_cpu_seconds = system_cpu_time.tv_sec as f64\n            + system_cpu_time.tv_usec as f64 * 1e-6;\n        let real_time_elapsed = end.duration_since(start);\n\n        if user_cpu_seconds + system_cpu_seconds > 1.0 {\n            panic!(\n                \"Database used too much CPU during a quiescent workload. 
User: {}s, system: {}s (wall clock: {}s)\",\n                user_cpu_seconds,\n                system_cpu_seconds,\n                real_time_elapsed.as_secs_f64(),\n            );\n        }\n    }\n\n    let child = unsafe { libc::fork() };\n    if child == 0 {\n        common::setup_logger();\n        if let Err(e) = std::thread::spawn(run).join() {\n            println!(\"test failed: {:?}\", e);\n            std::process::exit(15);\n        } else {\n            std::process::exit(0);\n        }\n    } else {\n        let mut status = 0;\n        unsafe {\n            libc::waitpid(child, &mut status as *mut libc::c_int, 0);\n        }\n        if status != 0 {\n            cleanup(DB_DIR);\n            panic!(\"child exited abnormally\");\n        }\n    }\n\n    cleanup(DB_DIR);\n}\n"
  },
  {
    "path": "tests/test_space_leaks.rs",
    "content": "use std::io;\n\nmod common;\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn size_leak() -> io::Result<()> {\n    common::setup_logger();\n\n    let tree: sled::Db<1024> =\n        sled::Config::tmp()?.flush_every_ms(None).open()?;\n\n    for _ in 0..10_000 {\n        tree.insert(b\"\", b\"\")?;\n    }\n\n    tree.flush()?;\n\n    let sz = tree.size_on_disk()?;\n    assert!(\n        sz <= 16384,\n        \"expected system to use less than or equal to \\\n            16384 bytes, but actually used {}\",\n        sz\n    );\n\n    Ok(())\n}\n"
  },
  {
    "path": "tests/test_tree.rs",
    "content": "mod common;\nmod tree;\n\nuse std::{\n    io,\n    sync::{\n        Arc, Barrier,\n        atomic::{AtomicBool, AtomicUsize, Ordering::SeqCst},\n    },\n};\n\n#[allow(unused_imports)]\nuse log::{debug, warn};\n\nuse quickcheck::{Gen, QuickCheck};\n\n// use sled::Transactional;\n// use sled::transaction::*;\nuse sled::{Config, Db as SledDb, InlineArray};\n\ntype Db = SledDb<3>;\n\nuse tree::{\n    Op::{self},\n    prop_tree_matches_btreemap,\n};\n\nconst N_THREADS: usize = 32;\nconst N_PER_THREAD: usize = 10_000;\nconst N: usize = N_THREADS * N_PER_THREAD; // NB N should be multiple of N_THREADS\nconst SPACE: usize = N;\n\n#[allow(dead_code)]\nconst INTENSITY: usize = 1;\n\nfn kv(i: usize) -> InlineArray {\n    let i = i % SPACE;\n    let k = [(i >> 16) as u8, (i >> 8) as u8, i as u8];\n    (&k).into()\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn monotonic_inserts() {\n    common::setup_logger();\n\n    let db: Db = Config::tmp().unwrap().flush_every_ms(None).open().unwrap();\n\n    for len in [1_usize, 16, 32, 1024].iter() {\n        for i in 0_usize..*len {\n            let mut k = vec![];\n            for c in 0_usize..i {\n                k.push((c % 256) as u8);\n            }\n            db.insert(&k, &[]).unwrap();\n        }\n\n        let count = db.iter().count();\n        assert_eq!(count, *len as usize);\n\n        let count2 = db.iter().rev().count();\n        assert_eq!(count2, *len as usize);\n\n        db.clear().unwrap();\n    }\n\n    for len in [1_usize, 16, 32, 1024].iter() {\n        for i in (0_usize..*len).rev() {\n            let mut k = vec![];\n            for c in (0_usize..i).rev() {\n                k.push((c % 256) as u8);\n            }\n            db.insert(&k, &[]).unwrap();\n        }\n\n        let count3 = db.iter().count();\n        assert_eq!(count3, *len as usize);\n\n        let count4 = db.iter().rev().count();\n        assert_eq!(count4, *len as usize);\n\n        db.clear().unwrap();\n    
}\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn fixed_stride_inserts() {\n    // this is intended to test the fixed stride key omission optimization\n    common::setup_logger();\n\n    let db: Db = Config::tmp().unwrap().flush_every_ms(None).open().unwrap();\n\n    let mut expected = std::collections::HashSet::new();\n    for k in 0..4096_u16 {\n        db.insert(&k.to_be_bytes(), &[]).unwrap();\n        expected.insert(k.to_be_bytes().to_vec());\n    }\n\n    let mut count = 0_u16;\n    for kvr in db.iter() {\n        let (k, _) = kvr.unwrap();\n        assert_eq!(&k, &count.to_be_bytes());\n        count += 1;\n    }\n    assert_eq!(count, 4096, \"tree: {:?}\", db);\n    assert_eq!(db.len().unwrap(), 4096);\n\n    let count = db.iter().rev().count();\n    assert_eq!(count, 4096);\n\n    for k in 0..4096_u16 {\n        db.insert(&k.to_be_bytes(), &[1]).unwrap();\n    }\n\n    let count = db.iter().count();\n    assert_eq!(count, 4096);\n\n    let count = db.iter().rev().count();\n    assert_eq!(count, 4096);\n    assert_eq!(db.len().unwrap(), 4096);\n\n    for k in 0..4096_u16 {\n        db.remove(&k.to_be_bytes()).unwrap();\n    }\n\n    let count = db.iter().count();\n    assert_eq!(count, 0);\n\n    let count = db.iter().rev().count();\n    assert_eq!(count, 0);\n    assert_eq!(db.len().unwrap(), 0);\n    assert!(db.is_empty().unwrap());\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn sequential_inserts() {\n    common::setup_logger();\n\n    let db: Db = Config::tmp().unwrap().flush_every_ms(None).open().unwrap();\n\n    for len in [1, 16, 32, u16::MAX].iter() {\n        for i in 0..*len {\n            db.insert(&i.to_le_bytes(), &[]).unwrap();\n        }\n\n        let count = db.iter().count();\n        assert_eq!(count, *len as usize);\n\n        let count2 = db.iter().rev().count();\n        assert_eq!(count2, *len as usize);\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn reverse_inserts() {\n    common::setup_logger();\n\n    let db: Db = 
Config::tmp().unwrap().flush_every_ms(None).open().unwrap();\n\n    for len in [1, 16, 32, u16::MAX].iter() {\n        for i in 0..*len {\n            let i2 = u16::MAX - i;\n            db.insert(&i2.to_le_bytes(), &[]).unwrap();\n        }\n\n        let count = db.iter().count();\n        assert_eq!(count, *len as usize);\n\n        let count2 = db.iter().rev().count();\n        assert_eq!(count2, *len as usize);\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn very_large_reverse_tree_iterator() {\n    let mut a = vec![255; 1024 * 1024];\n    a.push(0);\n    let mut b = vec![255; 1024 * 1024];\n    b.push(1);\n\n    let db: Db = Config::tmp().unwrap().flush_every_ms(Some(1)).open().unwrap();\n\n    db.insert(a, \"\").unwrap();\n    db.insert(b, \"\").unwrap();\n\n    assert_eq!(db.iter().rev().count(), 2);\n}\n\n#[test]\n#[cfg(all(target_os = \"linux\", not(miri)))]\nfn varied_compression_ratios() {\n    // tests for the compression issue reported in #938 by @Mrmaxmeier.\n\n    let low_entropy = vec![0u8; 64 << 10]; // 64k zeroes\n    let high_entropy = {\n        // 64mb random\n        use std::fs::File;\n        use std::io::Read;\n        let mut buf = vec![0u8; 64 << 20];\n        File::open(\"/dev/urandom\").unwrap().read_exact(&mut buf).unwrap();\n        buf\n    };\n\n    let tree: Db =\n        Config::default().path(\"compression_db_test\").open().unwrap();\n\n    tree.insert(b\"low  entropy\", &low_entropy[..]).unwrap();\n    tree.insert(b\"high entropy\", &high_entropy[..]).unwrap();\n\n    println!(\"reloading database...\");\n    drop(tree);\n    let tree: Db =\n        Config::default().path(\"compression_db_test\").open().unwrap();\n    drop(tree);\n\n    let _ = std::fs::remove_dir_all(\"compression_db_test\");\n}\n\n#[test]\nfn test_pop_first() -> io::Result<()> {\n    let config = sled::Config::tmp().unwrap();\n    let db: sled::Db<4> = config.open()?;\n    db.insert(&[0], vec![0])?;\n    db.insert(&[1], vec![10])?;\n    db.insert(&[2], 
vec![20])?;\n    db.insert(&[3], vec![30])?;\n    db.insert(&[4], vec![40])?;\n    db.insert(&[5], vec![50])?;\n\n    assert_eq!(&db.pop_first()?.unwrap().0, &[0]);\n    assert_eq!(&db.pop_first()?.unwrap().0, &[1]);\n    assert_eq!(&db.pop_first()?.unwrap().0, &[2]);\n    assert_eq!(&db.pop_first()?.unwrap().0, &[3]);\n    assert_eq!(&db.pop_first()?.unwrap().0, &[4]);\n    assert_eq!(&db.pop_first()?.unwrap().0, &[5]);\n    assert_eq!(db.pop_first()?, None);\n    /*\n     */\n    Ok(())\n}\n\n#[test]\nfn test_pop_last_in_range() -> io::Result<()> {\n    let config = sled::Config::tmp().unwrap();\n    let db: sled::Db<4> = config.open()?;\n\n    let data = vec![\n        (b\"key 1\", b\"value 1\"),\n        (b\"key 2\", b\"value 2\"),\n        (b\"key 3\", b\"value 3\"),\n    ];\n\n    for (k, v) in data {\n        db.insert(k, v).unwrap();\n    }\n\n    let r1 = db.pop_last_in_range(b\"key 1\".as_ref()..=b\"key 3\").unwrap();\n    assert_eq!(Some((b\"key 3\".into(), b\"value 3\".into())), r1);\n\n    let r2 = db.pop_last_in_range(b\"key 1\".as_ref()..b\"key 3\").unwrap();\n    assert_eq!(Some((b\"key 2\".into(), b\"value 2\".into())), r2);\n\n    let r3 = db.pop_last_in_range(b\"key 4\".as_ref()..).unwrap();\n    assert!(r3.is_none());\n\n    let r4 = db.pop_last_in_range(b\"key 2\".as_ref()..=b\"key 3\").unwrap();\n    assert!(r4.is_none());\n\n    let r5 = db.pop_last_in_range(b\"key 0\".as_ref()..=b\"key 3\").unwrap();\n    assert_eq!(Some((b\"key 1\".into(), b\"value 1\".into())), r5);\n\n    let r6 = db.pop_last_in_range(b\"key 0\".as_ref()..=b\"key 3\").unwrap();\n    assert!(r6.is_none());\n    Ok(())\n}\n\n#[test]\nfn test_interleaved_gets_sets() {\n    common::setup_logger();\n    let db: Db =\n        Config::tmp().unwrap().cache_capacity_bytes(1024).open().unwrap();\n\n    let done = Arc::new(AtomicBool::new(false));\n\n    std::thread::scope(|scope| {\n        let db_2 = db.clone();\n        let done = &done;\n        scope.spawn(move || {\n           
 for v in 0..500_000_u32 {\n                db_2.insert(v.to_be_bytes(), &[42u8; 4096][..])\n                    .expect(\"failed to insert\");\n                if v % 10_000 == 0 {\n                    log::trace!(\"WRITING: {}\", v);\n                    db_2.flush().unwrap();\n                }\n            }\n            done.store(true, SeqCst);\n        });\n        scope.spawn(move || {\n            while !done.load(SeqCst) {\n                for v in (0..500_000_u32).rev() {\n                    db.get(v.to_be_bytes()).expect(\"Fatal error?\");\n                    if v % 10_000 == 0 {\n                        log::trace!(\"READING: {}\", v)\n                    }\n                }\n            }\n        });\n    });\n}\n\n#[test]\n#[cfg(not(miri))] // can't create threads\nfn concurrent_tree_pops() -> std::io::Result<()> {\n    use std::thread;\n\n    let db: Db = Config::tmp().unwrap().open()?;\n\n    // Insert values 0..5\n    for x in 0u32..5 {\n        db.insert(x.to_be_bytes(), &[])?;\n    }\n\n    let mut threads = vec![];\n\n    // Pop 5 values using multiple threads\n    let barrier = Arc::new(Barrier::new(5));\n    for _ in 0..5 {\n        let barrier = barrier.clone();\n        let db: Db = db.clone();\n        threads.push(thread::spawn(move || {\n            barrier.wait();\n            db.pop_first().unwrap().unwrap();\n        }));\n    }\n\n    for thread in threads.into_iter() {\n        thread.join().unwrap();\n    }\n\n    assert!(\n        db.is_empty().unwrap(),\n        \"elements left in database: {:?}\",\n        db.iter().collect::<Vec<_>>()\n    );\n\n    Ok(())\n}\n\n#[test]\n#[cfg(not(miri))] // can't create threads\nfn concurrent_tree_ops() {\n    use std::thread;\n\n    common::setup_logger();\n\n    for i in 0..INTENSITY {\n        debug!(\"beginning test {}\", i);\n\n        let config = Config::tmp()\n            .unwrap()\n            .flush_every_ms(Some(1))\n            .cache_capacity_bytes(1024);\n\n        
macro_rules! par {\n            ($t:ident, $f:expr) => {\n                let mut threads = vec![];\n\n                let flusher_barrier = Arc::new(Barrier::new(N_THREADS));\n                for tn in 0..N_THREADS {\n                    let tree = $t.clone();\n                    let barrier = flusher_barrier.clone();\n                    let thread = thread::Builder::new()\n                        .name(format!(\"t(thread: {} flusher)\", tn))\n                        .spawn(move || {\n                            tree.flush().unwrap();\n                            barrier.wait();\n                        })\n                        .expect(\"should be able to spawn thread\");\n                    threads.push(thread);\n                }\n\n                let barrier = Arc::new(Barrier::new(N_THREADS));\n\n                for tn in 0..N_THREADS {\n                    let tree = $t.clone();\n                    let barrier = barrier.clone();\n                    let thread = thread::Builder::new()\n                        .name(format!(\"t(thread: {} test: {})\", tn, i))\n                        .spawn(move || {\n                            barrier.wait();\n                            for i in\n                                (tn * N_PER_THREAD)..((tn + 1) * N_PER_THREAD)\n                            {\n                                let k = kv(i);\n                                $f(&tree, k);\n                            }\n                        })\n                        .expect(\"should be able to spawn thread\");\n                    threads.push(thread);\n                }\n                while let Some(thread) = threads.pop() {\n                    if let Err(e) = thread.join() {\n                        panic!(\"thread failure: {:?}\", e);\n                    }\n                }\n            };\n        }\n\n        debug!(\"========== initial sets test {} ==========\", i);\n        let t: Db = config.open().unwrap();\n        par! 
{t, move |tree: &Db, k: InlineArray| {\n            assert_eq!(tree.get(&*k).unwrap(), None);\n            tree.insert(&k, k.clone()).expect(\"we should write successfully\");\n            assert_eq!(tree.get(&*k).unwrap(), Some(k.clone().into()),\n                \"failed to read key {:?} that we just wrote from tree {:?}\",\n                k, tree);\n        }};\n\n        let n_scanned = t.iter().count();\n        if n_scanned != N {\n            warn!(\n                \"WARNING: test {} only had {} keys present \\\n                 in the DB BEFORE restarting. expected {}\",\n                i, n_scanned, N,\n            );\n        }\n\n        drop(t);\n        let t: Db = config.open().expect(\"should be able to restart Db\");\n\n        let n_scanned = t.iter().count();\n        if n_scanned != N {\n            warn!(\n                \"WARNING: test {} only had {} keys present \\\n                 in the DB AFTER restarting. expected {}\",\n                i, n_scanned, N,\n            );\n        }\n\n        debug!(\"========== reading sets in test {} ==========\", i);\n        par! {t, move |tree: &Db, k: InlineArray| {\n            if let Some(v) =  tree.get(&*k).unwrap() {\n                if v != k {\n                    panic!(\"expected key {:?} not found\", k);\n                }\n            } else {\n                panic!(\n                    \"could not read key {:?}, which we \\\n                    just wrote to tree {:?}\", k, tree\n               );\n            }\n        }};\n\n        drop(t);\n        let t: Db = config.open().expect(\"should be able to restart Db\");\n\n        debug!(\"========== CAS test in test {} ==========\", i);\n        par! 
{t, move |tree: &Db, k: InlineArray| {\n            let k1 = k.clone();\n            let mut k2 = k;\n            k2.make_mut().reverse();\n            tree.compare_and_swap(&k1, Some(&*k1), Some(k2)).unwrap().unwrap();\n        }};\n\n        drop(t);\n        let t: Db = config.open().expect(\"should be able to restart Db\");\n\n        par! {t, move |tree: &Db, k: InlineArray| {\n            let k1 = k.clone();\n            let mut k2 = k;\n            k2.make_mut().reverse();\n            assert_eq!(tree.get(&*k1).unwrap().unwrap(), k2);\n        }};\n\n        drop(t);\n        let t: Db = config.open().expect(\"should be able to restart Db\");\n\n        debug!(\"========== deleting in test {} ==========\", i);\n        par! {t, move |tree: &Db, k: InlineArray| {\n            tree.remove(&*k).unwrap().unwrap();\n        }};\n\n        drop(t);\n        let t: Db = config.open().expect(\"should be able to restart Db\");\n\n        par! {t, move |tree: &Db, k: InlineArray| {\n            assert_eq!(tree.get(&*k).unwrap(), None);\n        }};\n    }\n}\n\n#[test]\n#[cfg(not(miri))] // can't create threads\nfn concurrent_tree_iter() -> io::Result<()> {\n    use std::sync::Barrier;\n    use std::thread;\n\n    common::setup_logger();\n\n    const N_FORWARD: usize = INTENSITY;\n    const N_REVERSE: usize = INTENSITY;\n    const N_INSERT: usize = INTENSITY;\n    const N_DELETE: usize = INTENSITY;\n    const N_FLUSHERS: usize = N_THREADS;\n\n    // items that are expected to always be present at their expected\n    // order, regardless of other inserts or deletes.\n    const INDELIBLE: [&[u8]; 16] = [\n        &[0u8],\n        &[1u8],\n        &[2u8],\n        &[3u8],\n        &[4u8],\n        &[5u8],\n        &[6u8],\n        &[7u8],\n        &[8u8],\n        &[9u8],\n        &[10u8],\n        &[11u8],\n        &[12u8],\n        &[13u8],\n        &[14u8],\n        &[15u8],\n    ];\n\n    let config = Config::tmp()\n        .unwrap()\n        
.cache_capacity_bytes(1024 * 1024 * 1024)\n        .flush_every_ms(Some(1));\n\n    let t: Db = config.open().unwrap();\n\n    let mut threads: Vec<thread::JoinHandle<io::Result<()>>> = vec![];\n\n    for tn in 0..N_FLUSHERS {\n        let tree = t.clone();\n        let thread = thread::Builder::new()\n            .name(format!(\"t(thread: {} flusher)\", tn))\n            .spawn(move || {\n                tree.flush().unwrap();\n                Ok(())\n            })\n            .expect(\"should be able to spawn thread\");\n        threads.push(thread);\n    }\n\n    for item in &INDELIBLE {\n        t.insert(item, item.to_vec())?;\n    }\n\n    let barrier =\n        Arc::new(Barrier::new(N_FORWARD + N_REVERSE + N_INSERT + N_DELETE));\n\n    static I: AtomicUsize = AtomicUsize::new(0);\n\n    for i in 0..N_FORWARD {\n        let t: Db = t.clone();\n        let barrier = barrier.clone();\n\n        let thread = thread::Builder::new()\n            .name(format!(\"forward({})\", i))\n            .spawn(move || {\n                I.fetch_add(1, SeqCst);\n                barrier.wait();\n                for _ in 0..1024 {\n                    let expected = INDELIBLE.iter();\n                    let mut keys = t.iter().keys();\n\n                    for expect in expected {\n                        loop {\n                            let k = keys.next().unwrap()?;\n                            assert!(\n                                &*k <= *expect,\n                                \"witnessed key is {:?} but we expected \\\n                                one <= {:?}, so we overshot due to a \\\n                                concurrent modification\",\n                                k,\n                                expect,\n                            );\n                            if &*k == *expect {\n                                break;\n                            }\n                        }\n                    }\n                }\n                
I.fetch_sub(1, SeqCst);\n\n                Ok(())\n            })\n            .unwrap();\n        threads.push(thread);\n    }\n\n    for i in 0..N_REVERSE {\n        let t: Db = t.clone();\n        let barrier = barrier.clone();\n\n        let thread = thread::Builder::new()\n            .name(format!(\"reverse({})\", i))\n            .spawn(move || {\n                I.fetch_add(1, SeqCst);\n                barrier.wait();\n                for _ in 0..1024 {\n                    let expected = INDELIBLE.iter().rev();\n                    let mut keys = t.iter().keys().rev();\n\n                    for expect in expected {\n                        loop {\n                            if let Some(Ok(k)) = keys.next() {\n                                assert!(\n                                    &*k >= *expect,\n                                    \"witnessed key is {:?} but we expected \\\n                                    one >= {:?}, so we overshot due to a \\\n                                    concurrent modification\\n{:?}\",\n                                    k,\n                                    expect,\n                                    t,\n                                );\n                                if &*k == *expect {\n                                    break;\n                                }\n                            } else {\n                                panic!(\"undershot key on tree: \\n{:?}\", t);\n                            }\n                        }\n                    }\n                }\n                I.fetch_sub(1, SeqCst);\n\n                Ok(())\n            })\n            .unwrap();\n\n        threads.push(thread);\n    }\n\n    for i in 0..N_INSERT {\n        let t: Db = t.clone();\n        let barrier = barrier.clone();\n\n        let thread = thread::Builder::new()\n            .name(format!(\"insert({})\", i))\n            .spawn(move || {\n                barrier.wait();\n\n                while 
I.load(SeqCst) != 0 {\n                    for i in 0..(16 * 16 * 8) {\n                        let major = i / (16 * 8);\n                        let minor = i % 16;\n\n                        let mut base = INDELIBLE[major].to_vec();\n                        base.push(minor as u8);\n                        t.insert(base.clone(), base.clone())?;\n                    }\n                }\n\n                Ok(())\n            })\n            .unwrap();\n\n        threads.push(thread);\n    }\n\n    for i in 0..N_DELETE {\n        let t: Db = t.clone();\n        let barrier = barrier.clone();\n\n        let thread = thread::Builder::new()\n            .name(format!(\"deleter({})\", i))\n            .spawn(move || {\n                barrier.wait();\n\n                while I.load(SeqCst) != 0 {\n                    for i in 0..(16 * 16 * 8) {\n                        let major = i / (16 * 8);\n                        let minor = i % 16;\n\n                        let mut base = INDELIBLE[major].to_vec();\n                        base.push(minor as u8);\n                        t.remove(&base)?;\n                    }\n                }\n\n                Ok(())\n            })\n            .unwrap();\n\n        threads.push(thread);\n    }\n\n    for thread in threads.into_iter() {\n        thread.join().expect(\"thread should not have crashed\")?;\n    }\n\n    t.check_error().expect(\"Db should have no set error\");\n\n    dbg!(t.stats());\n\n    Ok(())\n}\n\n/*\n#[test]\n#[cfg(not(miri))] // can't create threads\nfn concurrent_tree_transactions() -> TransactionResult<()> {\n    use std::sync::Barrier;\n\n    common::setup_logger();\n\n    let config = Config::new()\n        .temporary(true)\n        .flush_every_ms(Some(1))\n    let db: Db = config.open().unwrap();\n\n    db.insert(b\"k1\", b\"cats\").unwrap();\n    db.insert(b\"k2\", b\"dogs\").unwrap();\n\n    let mut threads: Vec<std::thread::JoinHandle<TransactionResult<()>>> =\n        vec![];\n\n    const 
N_WRITERS: usize = 30;\n    const N_READERS: usize = 5;\n    const N_SUBSCRIBERS: usize = 5;\n\n    let barrier = Arc::new(Barrier::new(N_WRITERS + N_READERS + N_SUBSCRIBERS));\n\n    for _ in 0..N_WRITERS {\n        let db: Db = db.clone();\n        let barrier = barrier.clone();\n        let thread = std::thread::spawn(move || {\n            barrier.wait();\n            for _ in 0..100 {\n                db.transaction(|db| {\n                    let v1 = db.remove(b\"k1\")?.unwrap();\n                    let v2 = db.remove(b\"k2\")?.unwrap();\n\n                    db.insert(b\"k1\", v2)?;\n                    db.insert(b\"k2\", v1)?;\n\n                    Ok(())\n                })?;\n            }\n            Ok(())\n        });\n        threads.push(thread);\n    }\n\n    for _ in 0..N_READERS {\n        let db: Db = db.clone();\n        let barrier = barrier.clone();\n        let thread = std::thread::spawn(move || {\n            barrier.wait();\n            for _ in 0..1000 {\n                db.transaction(|db| {\n                    let v1 = db.get(b\"k1\")?.unwrap();\n                    let v2 = db.get(b\"k2\")?.unwrap();\n\n                    let mut results = vec![v1, v2];\n                    results.sort();\n\n                    assert_eq!([&results[0], &results[1]], [b\"cats\", b\"dogs\"]);\n\n                    Ok(())\n                })?;\n            }\n            Ok(())\n        });\n        threads.push(thread);\n    }\n\n    for _ in 0..N_SUBSCRIBERS {\n        let db: Db = db.clone();\n        let barrier = barrier.clone();\n        let thread = std::thread::spawn(move || {\n            barrier.wait();\n            let mut sub = db.watch_prefix(b\"k1\");\n            drop(db);\n\n            while sub.next_timeout(Duration::from_millis(100)).is_ok() {}\n            drop(sub);\n\n            Ok(())\n        });\n        threads.push(thread);\n    }\n\n    for thread in threads.into_iter() {\n        thread.join().unwrap()?;\n    }\n\n   
 let v1 = db.get(b\"k1\")?.unwrap();\n    let v2 = db.get(b\"k2\")?.unwrap();\n    assert_eq!([v1, v2], [b\"cats\", b\"dogs\"]);\n\n    Ok(())\n}\n\n#[test]\nfn tree_flush_in_transaction() {\n    let config = sled::Config::tmp().unwrap();\n    let db: Db = config.open().unwrap();\n    let tree = db.open_tree(b\"a\").unwrap();\n\n    tree.transaction::<_, _, sled::transaction::TransactionError>(|tree| {\n        tree.insert(b\"k1\", b\"cats\")?;\n        tree.insert(b\"k2\", b\"dogs\")?;\n        tree.flush();\n        Ok(())\n    })\n    .unwrap();\n}\n\n#[test]\nfn incorrect_multiple_db_transactions() -> TransactionResult<()> {\n    common::setup_logger();\n\n    let db1 =\n        Config::tmp().unwrap().flush_every_ms(Some(1)).open().unwrap();\n    let db2 =\n        Config::tmp().unwrap().flush_every_ms(Some(1)).open().unwrap();\n\n    let result: TransactionResult<()> =\n        (&*db1, &*db2).transaction::<_, ()>(|_| Ok(()));\n\n    assert!(result.is_err());\n\n    Ok(())\n}\n\n#[test]\nfn many_tree_transactions() -> TransactionResult<()> {\n    common::setup_logger();\n\n    let config = Config::tmp().unwrap().flush_every_ms(Some(1));\n    let db: Db = Arc::new(config.open().unwrap());\n    let t1 = db.open_tree(b\"1\")?;\n    let t2 = db.open_tree(b\"2\")?;\n    let t3 = db.open_tree(b\"3\")?;\n    let t4 = db.open_tree(b\"4\")?;\n    let t5 = db.open_tree(b\"5\")?;\n    let t6 = db.open_tree(b\"6\")?;\n    let t7 = db.open_tree(b\"7\")?;\n    let t8 = db.open_tree(b\"8\")?;\n    let t9 = db.open_tree(b\"9\")?;\n\n    (&t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8, &t9).transaction(|trees| {\n        trees.0.insert(\"hi\", \"there\")?;\n        trees.8.insert(\"ok\", \"thanks\")?;\n        Ok(())\n    })\n}\n\n#[test]\nfn batch_outside_of_transaction() -> TransactionResult<()> {\n    common::setup_logger();\n\n    let config = Config::tmp().unwrap().flush_every_ms(Some(1));\n    let db: Db = config.open().unwrap();\n\n    let t1 = db.open_tree(b\"1\")?;\n\n    let 
mut b1 = Batch::default();\n    b1.insert(b\"k1\", b\"v1\");\n    b1.insert(b\"k2\", b\"v2\");\n\n    t1.transaction(|tree| {\n        tree.apply_batch(&b1)?;\n        Ok(())\n    })?;\n\n    assert_eq!(t1.get(b\"k1\")?, Some(b\"v1\".into()));\n    assert_eq!(t1.get(b\"k2\")?, Some(b\"v2\".into()));\n    Ok(())\n}\n*/\n\n#[test]\nfn tree_subdir() {\n    let mut parent_path = std::env::temp_dir();\n    parent_path.push(\"test_tree_subdir\");\n\n    let _ = std::fs::remove_dir_all(&parent_path);\n\n    let mut path = parent_path.clone();\n    path.push(\"test_subdir\");\n\n    let config = Config::new().path(&path);\n\n    let t: Db = config.open().unwrap();\n\n    t.insert(&[1], vec![1]).unwrap();\n\n    drop(t);\n\n    let config = Config::new().path(&path);\n\n    let t: Db = config.open().unwrap();\n\n    let res = t.get(&*vec![1]);\n\n    assert_eq!(res.unwrap().unwrap(), vec![1_u8]);\n\n    drop(t);\n\n    std::fs::remove_dir_all(&parent_path).unwrap();\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_small_keys_iterator() {\n    let config = Config::tmp().unwrap().flush_every_ms(Some(1));\n    let t: Db = config.open().unwrap();\n    for i in 0..N_PER_THREAD {\n        let k = kv(i);\n        t.insert(&k, k.clone()).unwrap();\n    }\n\n    for (i, (k, v)) in t.iter().map(|res| res.unwrap()).enumerate() {\n        let should_be = kv(i);\n        assert_eq!(should_be, &*k);\n        assert_eq!(should_be, &*v);\n    }\n\n    for (i, (k, v)) in t.iter().map(|res| res.unwrap()).enumerate() {\n        let should_be = kv(i);\n        assert_eq!(should_be, &*k);\n        assert_eq!(should_be, &*v);\n    }\n\n    let half_way = N_PER_THREAD / 2;\n    let half_key = kv(half_way);\n    let mut tree_scan = t.range(&*half_key..);\n    let r1 = tree_scan.next().unwrap().unwrap();\n    assert_eq!((r1.0.as_ref(), &*r1.1), (half_key.as_ref(), &*half_key));\n\n    let first_key = kv(0);\n    let mut tree_scan = t.range(&*first_key..);\n    let r2 = 
tree_scan.next().unwrap().unwrap();\n    assert_eq!((r2.0.as_ref(), &*r2.1), (first_key.as_ref(), &*first_key));\n\n    let last_key = kv(N_PER_THREAD - 1);\n    let mut tree_scan = t.range(&*last_key..);\n    let r3 = tree_scan.next().unwrap().unwrap();\n    assert_eq!((r3.0.as_ref(), &*r3.1), (last_key.as_ref(), &*last_key));\n    assert!(tree_scan.next().is_none());\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_big_keys_iterator() {\n    fn kv(i: usize) -> Vec<u8> {\n        let k = [(i >> 16) as u8, (i >> 8) as u8, i as u8];\n\n        let mut base = vec![0; u8::MAX as usize];\n        base.extend_from_slice(&k);\n        base\n    }\n\n    let config = Config::tmp().unwrap().flush_every_ms(Some(1));\n\n    let t: Db = config.open().unwrap();\n    for i in 0..N_PER_THREAD {\n        let k = kv(i);\n        t.insert(&k, k.clone()).unwrap();\n    }\n\n    for (i, (k, v)) in t.iter().map(|res| res.unwrap()).enumerate() {\n        let should_be = kv(i);\n        assert_eq!(should_be, &*k, \"{:#?}\", t);\n        assert_eq!(should_be, &*v);\n    }\n\n    for (i, (k, v)) in t.iter().map(|res| res.unwrap()).enumerate() {\n        let should_be = kv(i);\n        assert_eq!(should_be, &*k);\n        assert_eq!(should_be, &*v);\n    }\n\n    let half_way = N_PER_THREAD / 2;\n    let half_key = kv(half_way);\n    let mut tree_scan = t.range(&*half_key..);\n    let r1 = tree_scan.next().unwrap().unwrap();\n    assert_eq!((r1.0.as_ref(), &*r1.1), (half_key.as_ref(), &*half_key));\n\n    let first_key = kv(0);\n    let mut tree_scan = t.range(&*first_key..);\n    let r2 = tree_scan.next().unwrap().unwrap();\n    assert_eq!((r2.0.as_ref(), &*r2.1), (first_key.as_ref(), &*first_key));\n\n    let last_key = kv(N_PER_THREAD - 1);\n    let mut tree_scan = t.range(&*last_key..);\n    let r3 = tree_scan.next().unwrap().unwrap();\n    assert_eq!((r3.0.as_ref(), &*r3.1), (last_key.as_ref(), &*last_key));\n    assert!(tree_scan.next().is_none());\n}\n\n/*\n#[test]\nfn 
tree_subscribers_and_keyspaces() -> io::Result<()> {\n    let config = Config::tmp().unwrap().flush_every_ms(Some(1));\n\n    let db: Db = config.open().unwrap();\n\n    let t1 = db.open_tree(b\"1\")?;\n    let mut s1 = t1.watch_prefix(b\"\");\n\n    let t2 = db.open_tree(b\"2\")?;\n    let mut s2 = t2.watch_prefix(b\"\");\n\n    t1.insert(b\"t1_a\", b\"t1_a\".to_vec())?;\n    t2.insert(b\"t2_a\", b\"t2_a\".to_vec())?;\n\n    assert_eq!(s1.next().unwrap().iter().next().unwrap().1, b\"t1_a\");\n    assert_eq!(s2.next().unwrap().iter().next().unwrap().1, b\"t2_a\");\n\n    drop(db);\n    drop(t1);\n    drop(t2);\n\n    let db: Db = config.open().unwrap();\n\n    let t1 = db.open_tree(b\"1\")?;\n    let mut s1 = t1.watch_prefix(b\"\");\n\n    let t2 = db.open_tree(b\"2\")?;\n    let mut s2 = t2.watch_prefix(b\"\");\n\n    assert!(db.is_empty());\n    assert_eq!(t1.len(), 1);\n    assert_eq!(t2.len(), 1);\n\n    t1.insert(b\"t1_b\", b\"t1_b\".to_vec())?;\n    t2.insert(b\"t2_b\", b\"t2_b\".to_vec())?;\n\n    assert_eq!(s1.next().unwrap().iter().next().unwrap().1, b\"t1_b\");\n    assert_eq!(s2.next().unwrap().iter().next().unwrap().1, b\"t2_b\");\n\n    drop(db);\n    drop(t1);\n    drop(t2);\n\n    let db: Db = config.open().unwrap();\n\n    let t1 = db.open_tree(b\"1\")?;\n    let t2 = db.open_tree(b\"2\")?;\n\n    assert!(db.is_empty());\n    assert_eq!(t1.len(), 2);\n    assert_eq!(t2.len(), 2);\n\n    db.drop_tree(b\"1\")?;\n    db.drop_tree(b\"2\")?;\n\n    assert_eq!(t1.get(b\"\"), Err(Error::CollectionNotFound));\n\n    assert_eq!(t2.get(b\"\"), Err(Error::CollectionNotFound));\n\n    let guard = pin();\n    guard.flush();\n    drop(guard);\n\n    drop(db);\n    drop(t1);\n    drop(t2);\n\n    let db: Db = config.open().unwrap();\n\n    let t1 = db.open_tree(b\"1\")?;\n    let t2 = db.open_tree(b\"2\")?;\n\n    assert!(db.is_empty());\n    assert_eq!(t1.len(), 0);\n    assert_eq!(t2.len(), 0);\n\n    Ok(())\n}\n*/\n\n#[test]\nfn tree_range() {\n    
common::setup_logger();\n\n    let config = Config::tmp().unwrap().flush_every_ms(Some(1));\n    let t: sled::Db<7> = config.open().unwrap();\n\n    t.insert(b\"0\", vec![0]).unwrap();\n    t.insert(b\"1\", vec![10]).unwrap();\n    t.insert(b\"2\", vec![20]).unwrap();\n    t.insert(b\"3\", vec![30]).unwrap();\n    t.insert(b\"4\", vec![40]).unwrap();\n    t.insert(b\"5\", vec![50]).unwrap();\n\n    let start: &[u8] = b\"2\";\n    let end: &[u8] = b\"4\";\n    let mut r = t.range(start..end);\n    assert_eq!(r.next().unwrap().unwrap().0, b\"2\");\n    assert_eq!(r.next().unwrap().unwrap().0, b\"3\");\n    assert!(r.next().is_none());\n\n    let start = b\"2\".to_vec();\n    let end = b\"4\".to_vec();\n    let mut r = t.range(start..end).rev();\n    assert_eq!(r.next().unwrap().unwrap().0, b\"3\");\n    assert_eq!(r.next().unwrap().unwrap().0, b\"2\");\n    assert!(r.next().is_none());\n\n    let start = b\"2\".to_vec();\n    let mut r = t.range(start..);\n    assert_eq!(r.next().unwrap().unwrap().0, b\"2\");\n    assert_eq!(r.next().unwrap().unwrap().0, b\"3\");\n    assert_eq!(r.next().unwrap().unwrap().0, b\"4\");\n    assert_eq!(r.next().unwrap().unwrap().0, b\"5\");\n    assert!(r.next().is_none());\n\n    let start = b\"2\".to_vec();\n    let mut r = t.range(..=start).rev();\n    assert_eq!(\n        r.next().unwrap().unwrap().0,\n        b\"2\",\n        \"failed to find 2 in tree {:?}\",\n        t\n    );\n    assert_eq!(r.next().unwrap().unwrap().0, b\"1\");\n    assert_eq!(r.next().unwrap().unwrap().0, b\"0\");\n    assert!(r.next().is_none());\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn recover_tree() {\n    common::setup_logger();\n\n    let config = Config::tmp().unwrap().flush_every_ms(Some(1));\n\n    let t: sled::Db<7> = config.open().unwrap();\n    for i in 0..N_PER_THREAD {\n        let k = kv(i);\n        t.insert(&k, k.clone()).unwrap();\n    }\n    drop(t);\n\n    let t: sled::Db<7> = config.open().unwrap();\n    for i in 0..N_PER_THREAD {\n    
    let k = kv(i as usize);\n        assert_eq!(t.get(&*k).unwrap().unwrap(), k);\n        t.remove(&*k).unwrap();\n    }\n    drop(t);\n\n    println!(\n        \"---------------- recovering a (hopefully) empty db ----------------------\"\n    );\n\n    let t: sled::Db<7> = config.open().unwrap();\n    for i in 0..N_PER_THREAD {\n        let k = kv(i as usize);\n        assert!(\n            t.get(&*k).unwrap().is_none(),\n            \"expected key {:?} to have been deleted\",\n            i\n        );\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_gc() {\n    const FANOUT: usize = 7;\n\n    common::setup_logger();\n\n    let config = Config::tmp().unwrap().flush_every_ms(None);\n\n    let t: sled::Db<FANOUT> = config.open().unwrap();\n\n    for i in 0..N {\n        let k = kv(i);\n        t.insert(&k, k.clone()).unwrap();\n    }\n\n    for _ in 0..100 {\n        t.flush().unwrap();\n    }\n\n    let size_on_disk_after_inserts = t.size_on_disk().unwrap();\n\n    for i in 0..N {\n        let k = kv(i);\n        t.insert(&k, k.clone()).unwrap();\n    }\n\n    for _ in 0..100 {\n        t.flush().unwrap();\n    }\n\n    let size_on_disk_after_rewrites = t.size_on_disk().unwrap();\n\n    for i in 0..N {\n        let k = kv(i);\n        assert_eq!(t.get(&*k).unwrap(), Some(k.clone().into()), \"{k:?}\");\n        t.remove(&*k).unwrap();\n    }\n\n    for _ in 0..100 {\n        t.flush().unwrap();\n    }\n\n    let size_on_disk_after_deletes = t.size_on_disk().unwrap();\n\n    t.check_error().expect(\"Db should have no set error\");\n\n    let stats = t.stats();\n\n    dbg!(stats);\n\n    assert!(\n        stats.cache.heap.allocator.objects_allocated >= (N / FANOUT) as u64,\n        \"{stats:?}\"\n    );\n    assert!(\n        stats.cache.heap.allocator.objects_freed\n            >= (stats.cache.heap.allocator.objects_allocated / 2) as u64,\n        \"{stats:?}\"\n    );\n    assert!(\n        stats.cache.heap.allocator.heap_slots_allocated >= (N / FANOUT) as 
u64,\n        \"{stats:?}\"\n    );\n    assert!(\n        stats.cache.heap.allocator.heap_slots_freed\n            >= (stats.cache.heap.allocator.heap_slots_allocated / 2) as u64,\n        \"{stats:?}\"\n    );\n\n    let expected_max_size = size_on_disk_after_inserts / 15;\n    assert!(\n        size_on_disk_after_deletes <= expected_max_size,\n        \"expected file truncation to take size under {expected_max_size} \\\n        but it was {size_on_disk_after_deletes}\"\n    );\n    // TODO assert!(stats.cache.heap.truncated_file_bytes > 0);\n\n    println!(\n        \"after writing {N} items and removing them, disk size went \\\n        from {}kb after inserts to {}kb after rewriting to {}kb after deletes\",\n        size_on_disk_after_inserts / 1024,\n        size_on_disk_after_rewrites / 1024,\n        size_on_disk_after_deletes / 1024,\n    );\n}\n\n/*\n#[test]\nfn create_exclusive() {\n    common::setup_logger();\n\n    let path = \"create_exclusive_db\";\n    let _ = std::fs::remove_dir_all(path);\n\n    {\n        let config = Config::new().create_new(true).path(path);\n        config.open().unwrap();\n    }\n\n    let config = Config::new().create_new(true).path(path);\n    config.open().unwrap_err();\n    std::fs::remove_dir_all(path).unwrap();\n}\n*/\n\n#[test]\nfn contains_tree() {\n    let db: Db = Config::tmp().unwrap().flush_every_ms(None).open().unwrap();\n    let tree_one = db.open_tree(\"tree 1\").unwrap();\n    let tree_two = db.open_tree(\"tree 2\").unwrap();\n\n    drop(tree_one);\n    drop(tree_two);\n\n    assert_eq!(false, db.contains_tree(\"tree 3\").unwrap());\n    assert_eq!(true, db.contains_tree(\"tree 1\").unwrap());\n    assert_eq!(true, db.contains_tree(\"tree 2\").unwrap());\n\n    assert!(db.drop_tree(\"tree 1\").unwrap());\n    assert_eq!(false, db.contains_tree(\"tree 1\").unwrap());\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn tree_import_export() -> io::Result<()> {\n    common::setup_logger();\n\n    let config_1 = 
Config::tmp().unwrap();\n    let config_2 = Config::tmp().unwrap();\n\n    let db: Db = config_1.open()?;\n    for db_id in 0..N_THREADS {\n        let tree_id = format!(\"tree_{}\", db_id);\n        let tree = db.open_tree(tree_id.as_bytes())?;\n        for i in 0..N_THREADS {\n            let k = kv(i);\n            tree.insert(&k, k.clone()).unwrap();\n        }\n    }\n\n    let checksum_a = db.checksum().unwrap();\n\n    drop(db);\n\n    let exporter: Db = config_1.open()?;\n    let importer: Db = config_2.open()?;\n\n    let export = exporter.export();\n    importer.import(export);\n\n    drop(exporter);\n    drop(config_1);\n    drop(importer);\n\n    let db: Db = config_2.open()?;\n\n    let checksum_b = db.checksum().unwrap();\n    assert_eq!(checksum_a, checksum_b);\n\n    for db_id in 0..N_THREADS {\n        let tree_id = format!(\"tree_{}\", db_id);\n        let tree = db.open_tree(tree_id.as_bytes())?;\n\n        for i in 0..N_THREADS {\n            let k = kv(i as usize);\n            assert_eq!(tree.get(&*k).unwrap().unwrap(), k);\n            tree.remove(&*k).unwrap();\n        }\n    }\n\n    let checksum_c = db.checksum().unwrap();\n\n    drop(db);\n\n    let db: Db = config_2.open()?;\n    for db_id in 0..N_THREADS {\n        let tree_id = format!(\"tree_{}\", db_id);\n        let tree = db.open_tree(tree_id.as_bytes())?;\n\n        for i in 0..N_THREADS {\n            let k = kv(i as usize);\n            assert_eq!(tree.get(&*k).unwrap(), None);\n        }\n    }\n\n    let checksum_d = db.checksum().unwrap();\n    assert_eq!(checksum_c, checksum_d);\n\n    Ok(())\n}\n\n#[test]\n#[cfg_attr(any(target_os = \"fuchsia\", miri), ignore)]\nfn quickcheck_tree_matches_btreemap() {\n    let n_tests = if cfg!(windows) { 25 } else { 100 };\n\n    QuickCheck::new()\n        .r#gen(Gen::new(100))\n        .tests(n_tests)\n        .max_tests(n_tests * 10)\n        .quickcheck(\n            prop_tree_matches_btreemap as fn(Vec<Op>, bool, i32, usize) -> 
bool,\n        );\n}\n"
  },
  {
    "path": "tests/test_tree_failpoints.rs",
    "content": "#![cfg(feature = \"failpoints\")]\nmod common;\n\nuse std::collections::BTreeMap;\nuse std::convert::TryInto;\nuse std::sync::Mutex;\n\nuse quickcheck::{Arbitrary, Gen, QuickCheck, StdGen};\nuse rand::{Rng, seq::SliceRandom};\n\nuse sled::*;\n\nconst SEGMENT_SIZE: usize = 256;\nconst BATCH_COUNTER_KEY: &[u8] = b\"batch_counter\";\n\n#[derive(Debug, Clone)]\nenum Op {\n    Set,\n    Del(u8),\n    Id,\n    Batched(Vec<BatchOp>),\n    Restart,\n    Flush,\n    FailPoint(&'static str, u64),\n}\n\n#[derive(Debug, Clone)]\nenum BatchOp {\n    Set,\n    Del(u8),\n}\n\nimpl Arbitrary for BatchOp {\n    fn arbitrary<G: Gen>(g: &mut G) -> BatchOp {\n        if g.gen_ratio(1, 2) {\n            BatchOp::Set\n        } else {\n            BatchOp::Del(g.r#gen::<u8>())\n        }\n    }\n}\n\nuse self::Op::*;\n\nimpl Arbitrary for Op {\n    fn arbitrary<G: Gen>(g: &mut G) -> Op {\n        let fail_points = vec![\n            \"buffer write\",\n            \"zero garbage segment\",\n            \"zero garbage segment post\",\n            \"zero garbage segment SA\",\n            \"buffer write post\",\n            \"write_config bytes\",\n            \"write_config crc\",\n            \"write_config fsync\",\n            \"write_config rename\",\n            \"write_config dir fsync\",\n            \"write_config post\",\n            \"segment initial free zero\",\n            \"snap write\",\n            \"snap write len\",\n            \"snap write crc\",\n            \"snap write post\",\n            \"snap write mv\",\n            \"snap write dir fsync\",\n            \"snap write mv post\",\n            \"snap write rm old\",\n            \"blob blob write\",\n            \"write_blob write crc\",\n            \"write_blob write kind_byte\",\n            \"write_blob write buf\",\n            \"file truncation\",\n            \"pwrite\",\n            \"pwrite partial\",\n        ];\n\n        if g.gen_bool(1. / 30.) 
{\n            return FailPoint(fail_points.choose(g).unwrap(), g.r#gen::<u64>());\n        }\n\n        if g.gen_bool(1. / 10.) {\n            return Restart;\n        }\n\n        let choice = g.gen_range(0, 5);\n\n        match choice {\n            0 => Set,\n            1 => Del(g.r#gen::<u8>()),\n            2 => Id,\n            3 => Batched(Arbitrary::arbitrary(g)),\n            4 => Flush,\n            _ => panic!(\"impossible choice\"),\n        }\n    }\n\n    fn shrink(&self) -> Box<dyn Iterator<Item = Op>> {\n        match self {\n            Del(ref lid) if *lid > 0 => {\n                Box::new(vec![Del(*lid / 2), Del(*lid - 1)].into_iter())\n            }\n            Batched(batch_ops) => Box::new(batch_ops.shrink().map(Batched)),\n            FailPoint(name, bitset) => {\n                if bitset.count_ones() > 1 {\n                    Box::new(\n                        vec![\n                            // clear last failure bit\n                            FailPoint(\n                                name,\n                                bitset ^ (1 << (63 - bitset.leading_zeros())),\n                            ),\n                            // clear first failure bit\n                            FailPoint(\n                                name,\n                                bitset ^ (1 << bitset.trailing_zeros()),\n                            ),\n                            // rewind all failure bits by one call\n                            FailPoint(name, bitset >> 1),\n                        ]\n                        .into_iter(),\n                    )\n                } else if *bitset > 1 {\n                    Box::new(vec![FailPoint(name, bitset >> 1)].into_iter())\n                } else {\n                    Box::new(vec![].into_iter())\n                }\n            }\n            _ => Box::new(vec![].into_iter()),\n        }\n    }\n}\n\nfn v(b: &[u8]) -> u16 {\n    if b[0] % 4 != 0 {\n        assert_eq!(b.len(), 2);\n    
}\n    (u16::from(b[0]) << 8) + u16::from(b[1])\n}\n\nfn value_factory(set_counter: u16) -> Vec<u8> {\n    let hi = (set_counter >> 8) as u8;\n    let lo = set_counter as u8;\n    if hi % 4 == 0 {\n        let mut val = vec![hi, lo];\n        val.extend(vec![\n            lo;\n            hi as usize * SEGMENT_SIZE / 4 * set_counter as usize\n        ]);\n        val\n    } else {\n        vec![hi, lo]\n    }\n}\n\nfn tear_down_failpoints() {\n    sled::fail::reset();\n}\n\n#[derive(Debug)]\nstruct ReferenceVersion {\n    value: Option<u16>,\n    batch: Option<u32>,\n}\n\n#[derive(Debug)]\nstruct ReferenceEntry {\n    versions: Vec<ReferenceVersion>,\n    crash_epoch: u32,\n}\n\nfn prop_tree_crashes_nicely(ops: Vec<Op>, flusher: bool) -> bool {\n    // forces quickcheck to run one thread at a time\n    static M: Lazy<Mutex<()>, fn() -> Mutex<()>> = Lazy::new(|| Mutex::new(()));\n\n    let _lock = M.lock().expect(\"our test lock should not be poisoned\");\n\n    // clear all failpoints that may be left over from the last run\n    tear_down_failpoints();\n\n    let res = std::panic::catch_unwind(|| {\n        run_tree_crashes_nicely(ops.clone(), flusher)\n    });\n\n    tear_down_failpoints();\n\n    match res {\n        Err(e) => {\n            println!(\n                \"failed with {:?} on ops {:?} flusher {}\",\n                e, ops, flusher\n            );\n            false\n        }\n        Ok(res) => {\n            if !res {\n                println!(\"failed with ops {:?} flusher: {}\", ops, flusher);\n            }\n            res\n        }\n    }\n}\n\nfn run_tree_crashes_nicely(ops: Vec<Op>, flusher: bool) -> bool {\n    common::setup_logger();\n\n    let config = Config::new()\n        .temporary(true)\n        .flush_every_ms(if flusher { Some(1) } else { None })\n        .cache_capacity(256)\n        .idgen_persist_interval(1)\n        .segment_size(SEGMENT_SIZE);\n\n    let mut tree = config.open().expect(\"tree should start\");\n    let mut 
reference = BTreeMap::new();\n    let mut max_id: isize = -1;\n    let mut crash_counter = 0;\n    let mut batch_counter: u32 = 1;\n\n    // For each Set operation, one entry is inserted to the tree with a two-byte\n    // key, and a variable-length value. The key is set to the encoded value\n    // of the `set_counter`, which increments by one with each Set\n    // operation. The value starts with the same two bytes as the\n    // key does, but some values are extended to be many segments long.\n    //\n    // Del operations delete one entry from the tree. Only keys from 0 to 255\n    // are eligible for deletion.\n\n    macro_rules! restart {\n        () => {\n            drop(tree);\n            let tree_res = config.global_error().and_then(|_| config.open());\n            tree = match tree_res {\n                Err(Error::FailPoint) => return true,\n                Err(e) => {\n                    println!(\"could not start database: {}\", e);\n                    return false;\n                }\n                Ok(tree) => tree,\n            };\n\n            let stable_batch = match tree.get(BATCH_COUNTER_KEY) {\n                Ok(Some(value)) => u32::from_be_bytes(value.as_ref().try_into().unwrap()),\n                Ok(None) => 0,\n                Err(Error::FailPoint) => return true,\n                Err(other) => panic!(\"failed to fetch batch counter after restart: {:?}\", other),\n            };\n            for (_, ref_entry) in reference.iter_mut() {\n                if ref_entry.versions.len() == 1 {\n                    continue;\n                }\n                // find the last version from a stable batch, if there is one,\n                // throw away all preceding versions\n                let committed_find_result = ref_entry.versions.iter().enumerate().rev().find(|(_, ReferenceVersion{ batch, value: _ })| match batch {\n                    Some(batch) => *batch <= stable_batch,\n                    None => false,\n                });\n  
              if let Some((committed_index, _)) = committed_find_result {\n                    let tail_versions = ref_entry.versions.split_off(committed_index);\n                    let _ = std::mem::replace(&mut ref_entry.versions, tail_versions);\n                }\n                // find the first version from a batch that wasn't committed,\n                // throw away it and all subsequent versions\n                let discarded_find_result = ref_entry.versions.iter().enumerate().find(|(_, ReferenceVersion{ batch, value: _})| match batch {\n                    Some(batch) => *batch > stable_batch,\n                    None => false,\n                });\n                if let Some((discarded_index, _)) = discarded_find_result {\n                    let _ = ref_entry.versions.split_off(discarded_index);\n                }\n            }\n\n            let mut ref_iter = reference.iter().map(|(ref rk, ref rv)| (**rk, *rv));\n            for res in tree.iter() {\n                let actual = match res {\n                    Ok((ref tk, _)) => {\n                        if tk == BATCH_COUNTER_KEY {\n                            continue;\n                        }\n                        v(tk)\n                    }\n                    Err(Error::FailPoint) => return true,\n                    Err(other) => panic!(\"failed to iterate over items in tree after restart: {:?}\", other),\n                };\n\n                // make sure the tree value is in there\n                while let Some((ref_key, ref_expected)) = ref_iter.next() {\n                    if ref_expected.versions.iter().all(|version| version.value.is_none()) {\n                        // this key should not be present in the tree, skip it and move on to the\n                        // next entry in the reference\n                        continue;\n                    } else if ref_expected.versions.iter().all(|version| version.value.is_some()) {\n                        // this key must be 
present in the tree, check if the keys from both\n                        // iterators match\n                        if actual != ref_key {\n                            panic!(\n                                \"expected to iterate over key {:?} but got {:?} instead due to it being missing in \\n\\ntree: {:?}\\n\\nref: {:?}\\n\",\n                                ref_key,\n                                actual,\n                                tree,\n                                reference,\n\n                            );\n                        }\n                        break;\n                    } else {\n                        // according to the reference, this key could either be present or absent,\n                        // depending on whether recent writes were successful. check whether the\n                        // keys from the two iterators match, if they do, the key happens to be\n                        // present, which is okay, if they don't, and the tree iterator is further\n                        // ahead than the reference iterator, the key happens to be absent, so we\n                        // skip the entry in the reference. 
if the reference iterator ever gets\n                        // further than the tree iterator, that means the tree has a key that it\n                        // should not.\n                        if actual == ref_key {\n                            // tree and reference agree, we can move on to the next tree item\n                            break;\n                        } else if ref_key > actual {\n                            // we have a bug, the reference iterator should always be <= tree\n                            // (this means that the key t was in the tree, but it wasn't in\n                            // the reference, so the reference iterator has advanced on past t)\n                            println!(\n                                \"tree verification failed: expected {:?} got {:?}\",\n                                ref_key,\n                                actual\n                            );\n                            return false;\n                        } else {\n                            // we are iterating through the reference until we have an item that\n                            // must be present or an uncertain item that matches the tree's real\n                            // item anyway\n                            continue;\n                        }\n                    }\n                }\n            }\n\n            // finish the rest of the reference iterator, and confirm the tree isn't missing\n            // any keys it needs to have at the end\n            while let Some((ref_key, ref_expected)) = ref_iter.next() {\n                if ref_expected.versions.iter().all(|version| version.value.is_some()) {\n                    // this key had to be present, but we got to the end of the tree without\n                    // seeing it\n                    println!(\"tree verification failed: expected {:?} got end\", ref_key);\n                    println!(\"expected: {:?}\", ref_expected);\n                    
println!(\"tree: {:?}\", tree);\n                    return false;\n                }\n            }\n            println!(\"finished verification\");\n        }\n    }\n\n    macro_rules! fp_crash {\n        ($e:expr) => {\n            match $e {\n                Ok(thing) => thing,\n                Err(Error::FailPoint) => {\n                    tear_down_failpoints();\n                    crash_counter += 1;\n                    restart!();\n                    continue;\n                }\n                other => {\n                    println!(\"got non-failpoint err: {:?}\", other);\n                    return false;\n                }\n            }\n        };\n    }\n\n    let mut set_counter = 0u16;\n\n    println!(\"ops: {:?}\", ops);\n\n    for op in ops.into_iter() {\n        match op {\n            Set => {\n                // update the reference to show that this key could be present.\n                // the next Flush operation will update the\n                // reference again, and require this key to be present\n                // (unless there's a crash before then).\n                let reference_entry = reference\n                    .entry(set_counter)\n                    .or_insert_with(|| ReferenceEntry {\n                        versions: vec![ReferenceVersion {\n                            value: None,\n                            batch: None,\n                        }],\n                        crash_epoch: crash_counter,\n                    });\n                reference_entry.versions.push(ReferenceVersion {\n                    value: Some(set_counter),\n                    batch: None,\n                });\n                reference_entry.crash_epoch = crash_counter;\n\n                fp_crash!(tree.insert(\n                    &u16::to_be_bytes(set_counter),\n                    value_factory(set_counter),\n                ));\n\n                set_counter += 1;\n            }\n            Del(k) => {\n                // if 
this key was already set, update the reference to show\n                // that this key could either be present or\n                // absent. the next Flush operation will update the reference\n                // again, and require this key to be absent (unless there's a\n                // crash before then).\n                reference.entry(u16::from(k)).and_modify(|v| {\n                    v.versions\n                        .push(ReferenceVersion { value: None, batch: None });\n                    v.crash_epoch = crash_counter;\n                });\n\n                fp_crash!(tree.remove(&*vec![0, k]));\n            }\n            Id => {\n                let id = fp_crash!(tree.generate_id());\n                assert!(\n                    id as isize > max_id,\n                    \"generated id of {} is not larger \\\n                     than previous max id of {}\",\n                    id,\n                    max_id,\n                );\n                max_id = id as isize;\n            }\n            Batched(batch_ops) => {\n                let mut batch = Batch::default();\n                batch.insert(\n                    BATCH_COUNTER_KEY,\n                    batch_counter.to_be_bytes().to_vec(),\n                );\n                for batch_op in batch_ops {\n                    match batch_op {\n                        BatchOp::Set => {\n                            let reference_entry = reference\n                                .entry(set_counter)\n                                .or_insert_with(|| ReferenceEntry {\n                                    versions: vec![ReferenceVersion {\n                                        value: None,\n                                        batch: None,\n                                    }],\n                                    crash_epoch: crash_counter,\n                                });\n                            reference_entry.versions.push(ReferenceVersion {\n                                
value: Some(set_counter),\n                                batch: Some(batch_counter),\n                            });\n                            reference_entry.crash_epoch = crash_counter;\n\n                            batch.insert(\n                                u16::to_be_bytes(set_counter).to_vec(),\n                                value_factory(set_counter),\n                            );\n\n                            set_counter += 1;\n                        }\n                        BatchOp::Del(k) => {\n                            reference.entry(u16::from(k)).and_modify(|v| {\n                                v.versions.push(ReferenceVersion {\n                                    value: None,\n                                    batch: Some(batch_counter),\n                                });\n                                v.crash_epoch = crash_counter;\n                            });\n\n                            batch.remove(u16::to_be_bytes(k.into()).to_vec());\n                        }\n                    }\n                }\n                batch_counter += 1;\n                fp_crash!(tree.apply_batch(batch));\n            }\n            Flush => {\n                fp_crash!(tree.flush());\n\n                // once a flush has been successfully completed, recent Set/Del\n                // operations should be durable. 
go through the\n                // reference, and if a Set/Del operation was done since\n                // the last crash, keep the value for that key corresponding to\n                // the most recent operation, and toss the rest.\n                for (_key, reference_entry) in reference.iter_mut() {\n                    if reference_entry.versions.len() > 1\n                        && reference_entry.crash_epoch == crash_counter\n                    {\n                        let last =\n                            std::mem::take(&mut reference_entry.versions)\n                                .pop()\n                                .unwrap();\n                        reference_entry.versions.push(last);\n                    }\n                }\n            }\n            Restart => {\n                restart!();\n            }\n            FailPoint(fp, bitset) => {\n                sled::fail::set(fp, bitset);\n            }\n        }\n    }\n\n    true\n}\n\n#[test]\n#[cfg_attr(any(target_os = \"fuchsia\", miri), ignore)]\nfn quickcheck_tree_with_failpoints() {\n    // use fewer tests for travis OSX builds that stall out all the time\n    let mut n_tests = 50;\n    if let Ok(Ok(value)) = std::env::var(\"QUICKCHECK_TESTS\").map(|s| s.parse())\n    {\n        n_tests = value;\n    }\n\n    let generator_sz = 100;\n\n    QuickCheck::new()\n        .r#gen(StdGen::new(rand::rng(), generator_sz))\n        .tests(n_tests)\n        .quickcheck(prop_tree_crashes_nicely as fn(Vec<Op>, bool) -> bool);\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_01() {\n    // postmortem 1: model did not account for proper reasons to fail to start\n    assert!(prop_tree_crashes_nicely(\n        vec![FailPoint(\"snap write\", 0xFFFFFFFFFFFFFFFF), Restart],\n        false,\n    ));\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_02() {\n    // postmortem 1: the system was assuming the happy path across failpoints\n    assert!(prop_tree_crashes_nicely(\n       
 vec![\n            FailPoint(\"buffer write post\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Set,\n            Restart\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_03() {\n    // postmortem 1: this was a regression that happened because we\n    // chose to eat errors about advancing snapshots, which trigger\n    // log flushes. We should not trigger flushes from snapshots,\n    // but first we need to make sure we are better about detecting\n    // tears, by not also using 0 as a failed flush signifier.\n    assert!(prop_tree_crashes_nicely(\n        vec![Set, Set, Set, Set, Set, Set, Set, Set, Restart,],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_04() {\n    // postmortem 1: the test model was not properly accounting for\n    // writes that may-or-may-not be present due to an error.\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            FailPoint(\"snap write\", 0xFFFFFFFFFFFFFFFF),\n            Del(0),\n            Set,\n            Restart\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_05() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            FailPoint(\"snap write mv post\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            FailPoint(\"snap write\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Set,\n            Set,\n            Restart,\n            FailPoint(\"zero segment\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Set,\n            Set,\n            Restart,\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_06() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            Del(0),\n            Set,\n            Set,\n            Set,\n            Restart,\n            FailPoint(\"zero segment post\", 
0xFFFFFFFFFFFFFFFF),\n            Set,\n            Set,\n            Set,\n            Restart,\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_07() {\n    // postmortem 1: We were crashing because a Segment was\n    // in the SegmentAccountant's to_clean Vec, but it had\n    // no present pages. This can legitimately happen when\n    // a Segment only contains failed log flushes.\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(17),\n            Del(29),\n            Del(246),\n            Del(248),\n            Set,\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_08() {\n    // postmortem 1: we were assuming that deletes would fail if buffer writes\n    // are disabled, but that's not true, because deletes might not cause any\n    // writes if the value was not present.\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(0),\n            FailPoint(\"buffer write post\", 0xFFFFFFFFFFFFFFFF),\n            Del(179),\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_09() {\n    // postmortem 1: recovery was not properly accounting for\n    // ordering issues around allocation and freeing of pages.\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            
Restart,\n            Del(110),\n            Del(0),\n            Set,\n            Restart,\n            Set,\n            Del(255),\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(38),\n            Set,\n            Set,\n            Del(253),\n            Set,\n            Restart,\n            Set,\n            Del(19),\n            Set,\n            Del(118),\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(151),\n            Set,\n            Set,\n            Del(201),\n            Set,\n            Restart,\n            Set,\n            Set,\n            Del(17),\n            Set,\n            Set,\n            Set,\n            Del(230),\n            Set,\n            Restart,\n        ],\n        true,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_10() {\n    // expected to iterate over 50 but got 49 instead\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Del(175),\n            Del(19),\n            Restart,\n            Del(155),\n            Del(111),\n            Set,\n            Del(4),\n            Set,\n            Set,\n            Set,\n            Set,\n            Restart,\n            Del(94),\n            Set,\n            Del(83),\n            Del(181),\n            Del(218),\n            Set,\n            Set,\n            Del(60),\n            Del(248),\n            Set,\n            Set,\n            Set,\n            Del(167),\n            Del(180),\n            Del(180),\n            Set,\n            Restart,\n            Del(14),\n            Set,\n            Set,\n            Del(156),\n            Del(29),\n            Del(190),\n            Set,\n            Set,\n            Del(245),\n            Set,\n            Del(231),\n            Del(95),\n            Set,\n            Restart,\n            Set,\n            Del(189),\n            Set,\n        
    Restart,\n            Set,\n            Del(249),\n            Set,\n            Set,\n            Del(110),\n            Del(75),\n            Set,\n            Restart,\n            Del(156),\n            Del(140),\n            Del(101),\n            Del(45),\n            Del(115),\n            Del(162),\n            Set,\n            Set,\n            Del(192),\n            Del(31),\n            Del(224),\n            Set,\n            Del(84),\n            Del(6),\n            Set,\n            Del(191),\n            Set,\n            Set,\n            Set,\n            Del(86),\n            Del(143),\n            Del(168),\n            Del(175),\n            Set,\n            Restart,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Restart,\n            Del(14),\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(60),\n            Set,\n            Del(115),\n            Restart,\n            Set,\n            Del(203),\n            Del(12),\n            Del(134),\n            Del(118),\n            Del(26),\n            Del(161),\n            Set,\n            Del(6),\n            Del(23),\n            Set,\n            Del(122),\n            Del(251),\n            Set,\n            Restart,\n            Set,\n            Set,\n            Del(252),\n            Del(88),\n            Set,\n            Del(140),\n            Del(164),\n            Del(203),\n            Del(165),\n            Set,\n            Set,\n            Restart,\n            Del(0),\n            Set,\n            Del(146),\n            Del(83),\n            Restart,\n            Del(0),\n            Set,\n            Del(55),\n            Set,\n            Set,\n            Del(89),\n            Set,\n            Set,\n            Del(105),\n            Restart,\n            Set,\n            Restart,\n            Del(145),\n            Set,\n            
Del(17),\n            Del(123),\n            Set,\n            Del(203),\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(192),\n            Del(58),\n            Restart,\n            Set,\n            Restart,\n            Set,\n            Restart,\n            Set,\n            Del(142),\n            Set,\n            Del(220),\n            Del(185),\n            Set,\n            Del(86),\n            Set,\n            Set,\n            Del(123),\n            Set,\n            Restart,\n            Del(56),\n            Del(191),\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(123),\n            Set,\n            Set,\n            Set,\n            Restart,\n            Del(20),\n            Del(47),\n            Del(207),\n            Del(45),\n            Set,\n            Set,\n            Set,\n            Del(83),\n            Set,\n            Del(92),\n            Del(117),\n            Set,\n            Set,\n            Restart,\n            Del(241),\n            Set,\n            Del(49),\n            Set,\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_11() {\n    // dupe lsn detected\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Restart,\n            Del(21),\n            Set,\n            Set,\n            FailPoint(\"buffer write post\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Set,\n            Restart,\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, 
ignore)]\nfn failpoints_bug_12() {\n    // postmortem 1: we were not sorting the recovery state, which\n    // led to divergent state across recoveries.\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            Del(0),\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Restart,\n            Set,\n            Set,\n            Set,\n            Restart,\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_13() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(0),\n            Set,\n            Set,\n            Set,\n            Del(2),\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(1),\n            Del(3),\n            Del(18),\n            Set,\n            Set,\n            Set,\n            Restart,\n            Set,\n            Set,\n            Set,\n            Set,\n            FailPoint(\"snap write\", 0xFFFFFFFFFFFFFFFF),\n            Del(4),\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_14() {\n    // postmortem 1: improper bounds on splits caused a loop to happen\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            FailPoint(\"blob blob write\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n        ],\n        false,\n    
))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_15() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![FailPoint(\"buffer write\", 0xFFFFFFFFFFFFFFFF), Id, Restart, Id],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_16() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![FailPoint(\"zero garbage segment\", 0xFFFFFFFFFFFFFFFF), Id, Id],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_17() {\n    // postmortem 1: during recovery we were not properly\n    // filtering replaced pages in segments by the source\n    // segment still\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Del(0),\n            Set,\n            Set,\n            Set,\n            Del(3),\n            Id,\n            Id,\n            Set,\n            Id,\n            Id,\n            Del(3),\n            Id,\n            Id,\n            Del(3),\n            Restart,\n            Id,\n            FailPoint(\"blob blob write\", 0xFFFFFFFFFFFFFFFF),\n            Id,\n            Restart,\n            Id,\n            Set,\n            Id,\n            Del(3),\n            Set\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_18() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![Id, Id, Set, Id, Id, Id, Set, Del(0), Restart, Del(0), Id, Set],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_19() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            Set,\n            Set,\n            Set,\n            Del(4),\n            Id,\n            Del(4),\n            Id,\n            Id,\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Id,\n            Set,\n            Set,\n            Del(11),\n            Del(13),\n            
Id,\n            Del(122),\n            Del(134),\n            Del(101),\n            Del(81),\n            Set,\n            Del(15),\n            Del(76),\n            Restart,\n            Set,\n            Id,\n            Id,\n            Set,\n            Restart\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_20() {\n    // postmortem 1: failed to filter out segments with\n    // uninitialized segment ID's when creating a segment\n    // iterator.\n    assert!(prop_tree_crashes_nicely(\n        vec![Restart, Set, Set, Del(0), Id, Id, Set, Del(0), Id, Set],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_21() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Id,\n            Del(242),\n            Set,\n            Del(172),\n            Id,\n            Del(142),\n            Del(183),\n            Set,\n            Set,\n            Set,\n            Set,\n            Set,\n            Id,\n            Id,\n            Set,\n            Id,\n            Set,\n            Id,\n            Del(187),\n            Set,\n            Id,\n            Set,\n            Id,\n            Del(152),\n            Del(231),\n            Del(45),\n            Del(181),\n            Restart,\n            Id,\n            Id,\n            Id,\n            Id,\n            Id,\n            Set,\n            Del(53),\n            Restart,\n            Set,\n            Del(202),\n            Id,\n            Set,\n            Set,\n            Set,\n            Id,\n            Restart,\n            Del(99),\n            Set,\n            Set,\n            Id,\n            Restart,\n            Del(93),\n            Id,\n            Set,\n            Del(38),\n            Id,\n            Del(158),\n            Del(49),\n            Id,\n            Del(145),\n            Del(35),\n            Set,\n            Del(94),\n            Del(115),\n         
   Id,\n            Restart,\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_22() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![Id, FailPoint(\"buffer write\", 0xFFFFFFFFFFFFFFFF), Set, Id],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_23() {\n    // postmortem 1: failed to handle allocation failures\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            FailPoint(\"blob blob write\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Set,\n            Set\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_24() {\n    // postmortem 1: was incorrectly setting global\n    // errors, and they were being used-after-free\n    assert!(prop_tree_crashes_nicely(\n        vec![FailPoint(\"buffer write\", 0xFFFFFFFFFFFFFFFF), Id,],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_25() {\n    // postmortem 1: after removing segment trailers, we\n    // no longer have the invariant that a write\n    // must be more than one byte\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Del(103),\n            Restart,\n            Del(242),\n            Del(125),\n            Restart,\n            Set,\n            Restart,\n            Id,\n            Del(183),\n            Id,\n            FailPoint(\"snap write crc\", 0xFFFFFFFFFFFFFFFF),\n            Del(141),\n            Del(8),\n            Del(188),\n            Set,\n            Set,\n            Restart,\n            Id,\n            Id,\n            Id,\n            Set,\n            Id,\n            Id,\n            Set,\n            Del(65),\n            Del(6),\n            Del(198),\n            Del(57),\n            Id,\n            FailPoint(\"snap write mv\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Del(164),\n            Del(43),\n            Del(161),\n           
 Id,\n            Restart,\n            Set,\n            Id,\n            Id,\n            Set,\n            Set,\n            Restart,\n            Restart,\n            Set,\n            Set,\n            Del(252),\n            Set,\n            Del(111),\n            Id,\n            Del(55)\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_26() {\n    // postmortem 1: after removing segment trailers, we\n    // no longer handled maxed segment recovery properly\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Id,\n            Set,\n            Set,\n            Del(167),\n            Del(251),\n            Del(24),\n            Set,\n            Del(111),\n            Id,\n            Del(133),\n            Del(187),\n            Restart,\n            Set,\n            Del(52),\n            Set,\n            Restart,\n            Set,\n            Set,\n            Id,\n            Set,\n            Set,\n            Id,\n            Id,\n            Set,\n            Set,\n            Del(95),\n            Set,\n            Id,\n            Del(59),\n            Del(133),\n            Del(209),\n            Id,\n            Del(89),\n            Id,\n            Set,\n            Del(46),\n            Set,\n            Del(246),\n            Restart,\n            Set,\n            Restart,\n            Restart,\n            Del(28),\n            Set,\n            Del(9),\n            Del(101),\n            Id,\n            Del(73),\n            Del(192),\n            Set,\n            Set,\n            Set,\n            Id,\n            Set,\n            Set,\n            Set,\n            Id,\n            Restart,\n            Del(92),\n            Del(212),\n            Del(215)\n        ],\n        false,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_27() {\n    // postmortem 1: a segment is recovered as empty at recovery,\n    // which prevented its lsn from being known, 
and when the SA\n    // was recovered it erroneously calculated its lsn as being -1\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Id,\n            Id,\n            Set,\n            Set,\n            Restart,\n            Set,\n            Id,\n            Id,\n            Set,\n            Del(197),\n            Del(148),\n            Restart,\n            Id,\n            Set,\n            Del(165),\n            Set,\n            Set,\n            Set,\n            Set,\n            Id,\n            Del(29),\n            Set,\n            Set,\n            Del(75),\n            Del(170),\n            Restart,\n            Restart,\n            Set\n        ],\n        true,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_28() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Del(61),\n            Id,\n            Del(127),\n            Set,\n            Restart,\n            Del(219),\n            Id,\n            Set,\n            Id,\n            Del(41),\n            Id,\n            Id,\n            Set,\n            Del(227),\n            Set,\n            Del(191),\n            Id,\n            Del(78),\n            Set,\n            Id,\n            Set,\n            Del(123),\n            Restart,\n            Restart,\n            Restart,\n            Id\n        ],\n        true,\n    ))\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_29() {\n    // postmortem 1: the test model was turning uncertain entries\n    // into certain entries even when there was an intervening crash\n    // between the Set and the Flush\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            FailPoint(\"buffer write\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Flush,\n            Restart\n        ],\n        false,\n    ));\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            Set,\n            Set,\n            FailPoint(\"snap 
write mv\", 0xFFFFFFFFFFFFFFFF),\n            Set,\n            Flush,\n            Restart\n        ],\n        false,\n    ));\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_30() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Set,\n            FailPoint(\"buffer write\", 0xFFFFFFFFFFFFFFFF),\n            Restart,\n            Flush,\n            Id\n        ],\n        false,\n    ));\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_31() {\n    // postmortem 1: apply_batch_inner drops a RecoveryGuard, which in turn\n    // drops a Reservation, and Reservation's drop implementation flushes\n    // itself and unwraps the Result returned, which has the FailPoint error\n    // in it\n    for _ in 0..10 {\n        assert!(prop_tree_crashes_nicely(\n            vec![\n                Del(0),\n                FailPoint(\"snap write\", 0xFFFFFFFFFFFFFFFF),\n                Batched(vec![])\n            ],\n            true,\n        ));\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_32() {\n    // postmortem 1:\n    for _ in 0..10 {\n        assert!(prop_tree_crashes_nicely(\n            vec![Batched(vec![BatchOp::Set, BatchOp::Set]), Restart],\n            false\n        ));\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_33() {\n    // postmortem 1:\n    assert!(prop_tree_crashes_nicely(\n        vec![\n            Batched(vec![\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Del(85),\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Del(148),\n                BatchOp::Set,\n                BatchOp::Set\n            ]),\n            Restart,\n            Batched(vec![\n                BatchOp::Del(255),\n                BatchOp::Del(42),\n                BatchOp::Del(150),\n                BatchOp::Del(16),\n                BatchOp::Set,\n                
BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Del(111),\n                BatchOp::Del(65),\n                BatchOp::Del(102),\n                BatchOp::Del(99),\n                BatchOp::Del(25),\n                BatchOp::Del(156),\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Del(73),\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Del(238),\n                BatchOp::Del(211),\n                BatchOp::Del(14),\n                BatchOp::Del(7),\n                BatchOp::Del(137),\n                BatchOp::Del(115),\n                BatchOp::Del(91),\n                BatchOp::Set,\n                BatchOp::Del(172),\n                BatchOp::Del(49),\n                BatchOp::Del(152),\n                BatchOp::Set,\n                BatchOp::Del(189),\n                BatchOp::Set,\n                BatchOp::Del(37),\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Del(96),\n                BatchOp::Set,\n                BatchOp::Set,\n                BatchOp::Del(159),\n                BatchOp::Del(126)\n            ])\n        ],\n        false\n    ));\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_34() {\n    // postmortem 1: the implementation of make_durable was not properly\n    // exiting the function when local durability was detected\n    use BatchOp::*;\n    assert!(prop_tree_crashes_nicely(\n        vec![Batched(vec![\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, 
Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n            Set, Set, Set, Set, Set, Set,\n        ])],\n        false\n    ));\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_35() {\n    // postmortem 1:\n    use BatchOp::*;\n    for _ in 0..50 {\n        assert!(prop_tree_crashes_nicely(\n            vec![\n                Batched(vec![Del(106), Set, Del(32), Del(149), Set]),\n                Flush,\n                Batched(vec![Del(136), Set, Set, Del(61), Set, Del(202)]),\n           
     Flush,\n                Batched(vec![Del(106), Set, Del(32), Del(149), Set]),\n                Flush,\n                Batched(vec![Del(136), Set, Set, Del(61), Set, Del(202)]),\n                Flush,\n                Batched(vec![Del(106), Set, Del(32), Del(149), Set]),\n                Flush,\n                Batched(vec![Del(136), Set, Set, Del(61), Set, Del(202)]),\n                Flush,\n            ],\n            true\n        ));\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_36() {\n    // postmortem 1: in `Tree::attempt_fmt` a result from the pagecache\n    // was asserted on rather than propagated, which caused failpoints\n    // to turn into panics.\n    use BatchOp::*;\n    for _ in 0..50 {\n        assert!(prop_tree_crashes_nicely(\n            vec![\n                Op::Batched(vec![\n                    Set,\n                    Set,\n                    Del(203),\n                    Set,\n                    Del(14),\n                    Set,\n                    Set,\n                    Set,\n                    Del(209),\n                    Set,\n                    Set,\n                    Set,\n                    Set,\n                    Set\n                ]),\n                Op::Set,\n                Op::Restart,\n                Op::FailPoint(\"buffer write post\", 17420268517488604084),\n                Op::Restart\n            ],\n            true\n        ))\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_37() {\n    // postmortem 1: global errors were not being properly set\n    use BatchOp::*;\n    for _ in 0..100 {\n        assert!(prop_tree_crashes_nicely(\n            vec![\n                Op::Batched(vec![Set, Set, Set, Set, Set, Set, Set]),\n                Op::FailPoint(\"pwrite\", 13605093379298630254),\n                Op::Restart,\n            ],\n            false\n        ))\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_38() {\n    // postmortem 1: 
global errors were not being properly set\n    use BatchOp::*;\n    for _ in 0..100 {\n        assert!(prop_tree_crashes_nicely(\n            vec![\n                Op::Batched(vec![\n                    Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set, Set,\n                ]),\n                Op::FailPoint(\"pwrite partial\", 18422008228777642734),\n                Op::Set,\n            ],\n            false\n        ))\n    }\n}\n\n#[test]\n#[cfg_attr(miri, ignore)]\nfn failpoints_bug_39() {\n    // postmortem 1:\n    use BatchOp::*;\n    for i in 0..100 {\n        assert!(prop_tree_crashes_nicely(\n            vec![Op::Batched(vec![Set; i % 50]), Op::Restart],\n            true\n        ))\n    }\n}\n"
  },
  {
    "path": "tests/tree/mod.rs",
    "content": "use std::{collections::BTreeMap, convert::TryInto, fmt, panic};\n\nuse quickcheck::{Arbitrary, Gen};\nuse rand_distr::{Distribution, Gamma};\n\nuse sled::{Config, Db as SledDb, InlineArray};\n\ntype Db = SledDb<3>;\n\n#[derive(Clone, Ord, PartialOrd, Eq, PartialEq)]\npub struct Key(pub Vec<u8>);\n\nimpl fmt::Debug for Key {\n    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {\n        if !self.0.is_empty() {\n            write!(\n                f,\n                \"Key(vec![{}; {}])\",\n                self.0.first().copied().unwrap_or(0),\n                self.0.len()\n            )\n        } else {\n            write!(f, \"Key(vec!{:?})\", self.0)\n        }\n    }\n}\n\nfn range(g: &mut Gen, min_inclusive: usize, max_exclusive: usize) -> usize {\n    assert!(max_exclusive > min_inclusive);\n    let range = max_exclusive - min_inclusive;\n    let generated = usize::arbitrary(g) % range;\n    min_inclusive + generated\n}\n\nimpl Arbitrary for Key {\n    #![allow(clippy::cast_possible_truncation)]\n    #![allow(clippy::cast_precision_loss)]\n    #![allow(clippy::cast_sign_loss)]\n\n    fn arbitrary(g: &mut Gen) -> Self {\n        if bool::arbitrary(g) {\n            let gs = g.size();\n            let gamma = Gamma::new(0.3, gs as f64).unwrap();\n            let v = gamma.sample(&mut rand::rng());\n            let len = if v > 3000.0 { 10000 } else { (v % 300.) 
as usize };\n\n            let space = range(g, 0, gs) + 1;\n\n            let inner = (0..len).map(|_| range(g, 0, space) as u8).collect();\n\n            Self(inner)\n        } else {\n            let len = range(g, 0, 2);\n            let mut inner = vec![];\n\n            for _ in 0..len {\n                inner.push(u8::arbitrary(g));\n            }\n\n            Self(inner)\n        }\n    }\n\n    fn shrink(&self) -> Box<dyn Iterator<Item = Self>> {\n        // we only want to shrink on length, not byte values\n        Box::new(\n            self.0\n                .len()\n                .shrink()\n                .zip(std::iter::repeat(self.0.clone()))\n                .map(|(len, underlying)| Self(underlying[..len].to_vec())),\n        )\n    }\n}\n\n#[derive(Debug, Clone)]\npub enum Op {\n    Set(Key, u8),\n    // Merge(Key, u8),\n    Get(Key),\n    GetLt(Key),\n    GetGt(Key),\n    Del(Key),\n    Cas(Key, u8, u8),\n    Scan(Key, isize),\n    Restart,\n}\n\nuse self::Op::*;\n\nimpl Arbitrary for Op {\n    fn arbitrary(g: &mut Gen) -> Self {\n        if range(g, 0, 10) == 0 {\n            return Restart;\n        }\n\n        let choice = range(g, 0, 7);\n\n        match choice {\n            0 => Set(Key::arbitrary(g), u8::arbitrary(g)),\n            1 => Get(Key::arbitrary(g)),\n            2 => GetLt(Key::arbitrary(g)),\n            3 => GetGt(Key::arbitrary(g)),\n            4 => Del(Key::arbitrary(g)),\n            5 => Cas(Key::arbitrary(g), u8::arbitrary(g), u8::arbitrary(g)),\n            6 => Scan(Key::arbitrary(g), range(g, 0, 80) as isize - 40),\n            //7 => Merge(Key::arbitrary(g), u8::arbitrary(g)),\n            _ => panic!(\"impossible choice\"),\n        }\n    }\n\n    fn shrink(&self) -> Box<dyn Iterator<Item = Self>> {\n        match *self {\n            Set(ref k, v) => Box::new(k.shrink().map(move |sk| Set(sk, v))),\n            /*\n            Merge(ref k, v) => Box::new(\n                k.shrink()\n                    
.flat_map(move |k| vec![Set(k.clone(), v), Merge(k, v)]),\n            ),\n            */\n            Get(ref k) => Box::new(k.shrink().map(Get)),\n            GetLt(ref k) => Box::new(k.shrink().map(GetLt)),\n            GetGt(ref k) => Box::new(k.shrink().map(GetGt)),\n            Cas(ref k, old, new) => {\n                Box::new(k.shrink().map(move |k| Cas(k, old, new)))\n            }\n            Scan(ref k, len) => Box::new(k.shrink().map(move |k| Scan(k, len))),\n            Del(ref k) => Box::new(k.shrink().map(Del)),\n            Restart => Box::new(vec![].into_iter()),\n        }\n    }\n}\n\nfn bytes_to_u16(v: &[u8]) -> u16 {\n    assert_eq!(v.len(), 2);\n    (u16::from(v[0]) << 8) + u16::from(v[1])\n}\n\nfn u16_to_bytes(u: u16) -> Vec<u8> {\n    u.to_be_bytes().to_vec()\n}\n\n/*\n// just adds up values as if they were u16's\nfn merge_operator(\n    _k: &[u8],\n    old: Option<&[u8]>,\n    to_merge: &[u8],\n) -> Option<Vec<u8>> {\n    let base = old.unwrap_or(&[0, 0]);\n    let base_n = bytes_to_u16(base);\n    let new_n = base_n + u16::from(to_merge[0]);\n    let ret = u16_to_bytes(new_n);\n    Some(ret)\n}\n*/\n\npub fn prop_tree_matches_btreemap(\n    ops: Vec<Op>,\n    flusher: bool,\n    compression_level: i32,\n    cache_size: usize,\n) -> bool {\n    if let Err(e) = prop_tree_matches_btreemap_inner(\n        ops,\n        flusher,\n        compression_level,\n        cache_size,\n    ) {\n        eprintln!(\"hit error while running quickcheck on tree: {:?}\", e);\n        false\n    } else {\n        true\n    }\n}\n\nfn prop_tree_matches_btreemap_inner(\n    ops: Vec<Op>,\n    flusher: bool,\n    compression: i32,\n    cache_size: usize,\n) -> std::io::Result<()> {\n    use self::*;\n\n    super::common::setup_logger();\n\n    let config = Config::tmp()?\n        .zstd_compression_level(compression)\n        .flush_every_ms(if flusher { Some(1) } else { None })\n        .cache_capacity_bytes(cache_size);\n\n    let mut tree: Db = 
config.open().unwrap();\n    //tree.set_merge_operator(merge_operator);\n\n    let mut reference: BTreeMap<Key, u16> = BTreeMap::new();\n\n    for op in ops {\n        match op {\n            Set(k, v) => {\n                let old_actual = tree.insert(&k.0, vec![0, v]).unwrap();\n                let old_reference = reference.insert(k.clone(), u16::from(v));\n                assert_eq!(\n                    old_actual.map(|v| bytes_to_u16(&*v)),\n                    old_reference,\n                    \"when setting key {:?}, expected old returned value to be {:?}\\n{:?}\",\n                    k,\n                    old_reference,\n                    tree\n                );\n            }\n            /*\n            Merge(k, v) => {\n                tree.merge(&k.0, vec![v]).unwrap();\n                let entry = reference.entry(k).or_insert(0_u16);\n                *entry += u16::from(v);\n            }\n            */\n            Get(k) => {\n                let res1 = tree.get(&*k.0).unwrap().map(|v| bytes_to_u16(&*v));\n                let res2 = reference.get(&k).cloned();\n                assert_eq!(res1, res2);\n            }\n            GetLt(k) => {\n                let res1 = tree.get_lt(&*k.0).unwrap().map(|v| v.0);\n                let res2 = reference\n                    .iter()\n                    .rev()\n                    .find(|(key, _)| **key < k)\n                    .map(|(k, _v)| InlineArray::from(&*k.0));\n                assert_eq!(\n                    res1, res2,\n                    \"get_lt({:?}) should have returned {:?} \\\n                     but it returned {:?} instead. 
\\\n                     \\n Db: {:?}\",\n                    k, res2, res1, tree\n                );\n            }\n            GetGt(k) => {\n                let res1 = tree.get_gt(&*k.0).unwrap().map(|v| v.0);\n                let res2 = reference\n                    .iter()\n                    .find(|(key, _)| **key > k)\n                    .map(|(k, _v)| InlineArray::from(&*k.0));\n                assert_eq!(\n                    res1, res2,\n                    \"get_gt({:?}) expected {:?} in tree {:?}\",\n                    k, res2, tree\n                );\n            }\n            Del(k) => {\n                tree.remove(&*k.0).unwrap();\n                reference.remove(&k);\n            }\n            Cas(k, old, new) => {\n                let tree_old = tree.get(&*k.0).unwrap();\n                if let Some(old_tree) = tree_old {\n                    if old_tree == *vec![0, old] {\n                        tree.insert(&k.0, vec![0, new]).unwrap();\n                    }\n                }\n\n                let ref_old = reference.get(&k).cloned();\n                if ref_old == Some(u16::from(old)) {\n                    reference.insert(k, u16::from(new));\n                }\n            }\n            Scan(k, len) => {\n                if len > 0 {\n                    let mut tree_iter = tree\n                        .range(&*k.0..)\n                        .take(len.abs().try_into().unwrap())\n                        .map(Result::unwrap);\n                    let ref_iter = reference\n                        .iter()\n                        .filter(|&(rk, _rv)| *rk >= k)\n                        .take(len.abs().try_into().unwrap())\n                        .map(|(rk, rv)| (rk.0.clone(), *rv));\n\n                    for r in ref_iter {\n                        let tree_next = tree_iter\n                            .next()\n                            .expect(\"iterator incorrectly stopped early\");\n                        let lhs = 
(tree_next.0, &*tree_next.1);\n                        let rhs = (r.0.clone(), &*u16_to_bytes(r.1));\n                        assert_eq!(\n                            (lhs.0.as_ref(), lhs.1),\n                            (rhs.0.as_ref(), rhs.1),\n                            \"expected {:?} while iterating from {:?} on tree: {:?}\",\n                            rhs,\n                            k,\n                            tree\n                        );\n                    }\n\n                    assert!(tree_iter.next().is_none());\n                } else {\n                    let mut tree_iter = tree\n                        .range(&*k.0..)\n                        .rev()\n                        .take(len.abs().try_into().unwrap())\n                        .map(Result::unwrap);\n                    let ref_iter = reference\n                        .iter()\n                        .rev()\n                        .filter(|&(rk, _rv)| *rk >= k)\n                        .take(len.abs().try_into().unwrap())\n                        .map(|(rk, rv)| (rk.0.clone(), *rv));\n\n                    for r in ref_iter {\n                        let tree_next = tree_iter.next().unwrap();\n                        let lhs = (tree_next.0, &*tree_next.1);\n                        let rhs = (r.0.clone(), &*u16_to_bytes(r.1));\n                        assert_eq!(\n                            (lhs.0.as_ref(), lhs.1),\n                            (rhs.0.as_ref(), rhs.1),\n                            \"expected {:?} while reverse iterating from {:?} on tree: {:?}\",\n                            rhs,\n                            k,\n                            tree\n                        );\n                    }\n\n                    assert!(tree_iter.next().is_none());\n                }\n            }\n            Restart => {\n                drop(tree);\n                tree = config.open().unwrap();\n                //tree.set_merge_operator(merge_operator);\n            
}\n        }\n        if let Err(e) = tree.check_error() {\n            eprintln!(\"quickcheck test encountered error: {:?}\", e);\n            return Err(e);\n        }\n    }\n\n    let _ = std::fs::remove_dir_all(config.path);\n\n    tree.check_error()\n}\n"
  }
]