Repository: erikgrinaker/toydb
Branch: main
Commit: 473afbdb4aea
Files: 284
Total size: 1.4 MB

Directory structure:
gitextract_nc06cv1f/

├── .github/
│   └── workflows/
│       └── ci.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── cluster/
│   ├── run.sh
│   ├── toydb1/
│   │   └── toydb.yaml
│   ├── toydb2/
│   │   └── toydb.yaml
│   ├── toydb3/
│   │   └── toydb.yaml
│   ├── toydb4/
│   │   └── toydb.yaml
│   └── toydb5/
│       └── toydb.yaml
├── config/
│   └── toydb.yaml
├── docs/
│   ├── architecture/
│   │   ├── README.md
│   │   ├── client.md
│   │   ├── encoding.md
│   │   ├── index.md
│   │   ├── mvcc.md
│   │   ├── overview.md
│   │   ├── raft.md
│   │   ├── server.md
│   │   ├── sql-data.md
│   │   ├── sql-execution.md
│   │   ├── sql-optimizer.md
│   │   ├── sql-parser.md
│   │   ├── sql-planner.md
│   │   ├── sql-raft.md
│   │   ├── sql-storage.md
│   │   ├── sql.md
│   │   └── storage.md
│   ├── architecture.md
│   ├── crate/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   └── src/
│   │       └── lib.rs
│   ├── examples.md
│   ├── references.md
│   ├── sql.md
│   └── tools/
│       └── update-links.py
├── rust-toolchain
├── rustfmt.toml
├── src/
│   ├── bin/
│   │   ├── toydb.rs
│   │   ├── toydump.rs
│   │   ├── toysql.rs
│   │   └── workload.rs
│   ├── client.rs
│   ├── encoding/
│   │   ├── bincode.rs
│   │   ├── format.rs
│   │   ├── keycode.rs
│   │   └── mod.rs
│   ├── error.rs
│   ├── lib.rs
│   ├── raft/
│   │   ├── log.rs
│   │   ├── message.rs
│   │   ├── mod.rs
│   │   ├── node.rs
│   │   ├── state.rs
│   │   └── testscripts/
│   │       ├── log/
│   │       │   ├── append
│   │       │   ├── commit
│   │       │   ├── get
│   │       │   ├── has
│   │       │   ├── init
│   │       │   ├── scan
│   │       │   ├── scan_apply
│   │       │   ├── splice
│   │       │   ├── status
│   │       │   └── term
│   │       └── node/
│   │           ├── append
│   │           ├── append_base_missing
│   │           ├── append_base_missing_all
│   │           ├── append_commit_quorum
│   │           ├── append_initial
│   │           ├── append_max_entries
│   │           ├── append_pipeline
│   │           ├── append_probe_divergent_first
│   │           ├── append_probe_divergent_long
│   │           ├── append_probe_divergent_short
│   │           ├── append_probe_divergent_single
│   │           ├── append_response_beyond_last_index_panics
│   │           ├── append_response_stale_reject
│   │           ├── election
│   │           ├── election_candidate_behind_leader
│   │           ├── election_candidate_behind_quorum
│   │           ├── election_contested
│   │           ├── election_tie
│   │           ├── election_tie_even
│   │           ├── heartbeat_commits_follower
│   │           ├── heartbeat_converts_candidate
│   │           ├── heartbeat_converts_follower
│   │           ├── heartbeat_converts_follower_leaderless
│   │           ├── heartbeat_converts_leader
│   │           ├── heartbeat_lost_append_duplicate
│   │           ├── heartbeat_lost_append_multiple
│   │           ├── heartbeat_lost_append_single
│   │           ├── heartbeat_lost_read
│   │           ├── heartbeat_match_commits
│   │           ├── heartbeat_multiple_leaders_panic
│   │           ├── heartbeat_old_commit_index
│   │           ├── heartbeat_old_last_index
│   │           ├── heartbeat_probe_divergent
│   │           ├── old_campaign_rejected
│   │           ├── old_campaign_response_ignored
│   │           ├── old_heartbeat_ignored
│   │           ├── request_candidate_abort
│   │           ├── request_follower
│   │           ├── request_follower_campaign_abort
│   │           ├── request_follower_disconnect_stall
│   │           ├── request_follower_leaderless_abort
│   │           ├── request_leader
│   │           ├── request_leader_campaign_abort
│   │           ├── request_leader_change_linearizability
│   │           ├── request_leader_disconnect
│   │           ├── request_leader_read_quorum
│   │           ├── request_leader_read_quorum_sequence
│   │           ├── request_leader_single
│   │           ├── request_status
│   │           ├── request_status_single
│   │           ├── restart
│   │           ├── restart_apply
│   │           ├── restart_commit_recover
│   │           ├── restart_term_vote
│   │           ├── tick_candidate
│   │           ├── tick_follower
│   │           ├── tick_follower_leaderless
│   │           └── tick_leader
│   ├── server.rs
│   ├── sql/
│   │   ├── engine/
│   │   │   ├── engine.rs
│   │   │   ├── local.rs
│   │   │   ├── mod.rs
│   │   │   └── raft.rs
│   │   ├── execution/
│   │   │   ├── aggregator.rs
│   │   │   ├── executor.rs
│   │   │   ├── join.rs
│   │   │   ├── mod.rs
│   │   │   └── session.rs
│   │   ├── mod.rs
│   │   ├── parser/
│   │   │   ├── ast.rs
│   │   │   ├── lexer.rs
│   │   │   ├── mod.rs
│   │   │   └── parser.rs
│   │   ├── planner/
│   │   │   ├── mod.rs
│   │   │   ├── optimizer.rs
│   │   │   ├── plan.rs
│   │   │   └── planner.rs
│   │   ├── testscripts/
│   │   │   ├── expressions/
│   │   │   │   ├── cnf
│   │   │   │   ├── func
│   │   │   │   ├── func_sqrt
│   │   │   │   ├── literals
│   │   │   │   ├── op_compare_equal
│   │   │   │   ├── op_compare_greater
│   │   │   │   ├── op_compare_greater_equal
│   │   │   │   ├── op_compare_is_nan
│   │   │   │   ├── op_compare_is_null
│   │   │   │   ├── op_compare_lesser
│   │   │   │   ├── op_compare_lesser_equal
│   │   │   │   ├── op_compare_not_equal
│   │   │   │   ├── op_logic_and
│   │   │   │   ├── op_logic_not
│   │   │   │   ├── op_logic_or
│   │   │   │   ├── op_math_add
│   │   │   │   ├── op_math_divide
│   │   │   │   ├── op_math_exponentiate
│   │   │   │   ├── op_math_factorial
│   │   │   │   ├── op_math_identity
│   │   │   │   ├── op_math_multiply
│   │   │   │   ├── op_math_negate
│   │   │   │   ├── op_math_remainder
│   │   │   │   ├── op_math_subtract
│   │   │   │   ├── op_precedence
│   │   │   │   └── op_string_like
│   │   │   ├── optimizers/
│   │   │   │   ├── constant_folder
│   │   │   │   ├── filter_pushdown
│   │   │   │   ├── hash_join
│   │   │   │   ├── index_lookup
│   │   │   │   └── short_circuit
│   │   │   ├── queries/
│   │   │   │   ├── aggregate
│   │   │   │   ├── clauses
│   │   │   │   ├── group_by
│   │   │   │   ├── having
│   │   │   │   ├── join_cross
│   │   │   │   ├── join_inner
│   │   │   │   ├── join_outer
│   │   │   │   ├── limit
│   │   │   │   ├── offset
│   │   │   │   ├── order
│   │   │   │   ├── select
│   │   │   │   ├── where_
│   │   │   │   ├── where_index
│   │   │   │   └── where_primary_key
│   │   │   ├── schema/
│   │   │   │   ├── create_table
│   │   │   │   ├── create_table_datatypes
│   │   │   │   ├── create_table_default
│   │   │   │   ├── create_table_index
│   │   │   │   ├── create_table_names
│   │   │   │   ├── create_table_null
│   │   │   │   ├── create_table_primary_key
│   │   │   │   ├── create_table_reference
│   │   │   │   ├── create_table_transaction
│   │   │   │   ├── create_table_unique
│   │   │   │   ├── drop_table
│   │   │   │   ├── drop_table_index
│   │   │   │   ├── drop_table_ref
│   │   │   │   └── drop_table_transaction
│   │   │   ├── transactions/
│   │   │   │   ├── anomaly_dirty_read
│   │   │   │   ├── anomaly_dirty_write
│   │   │   │   ├── anomaly_fuzzy_read
│   │   │   │   ├── anomaly_lost_update
│   │   │   │   ├── anomaly_phantom_read
│   │   │   │   ├── anomaly_read_skew
│   │   │   │   ├── anomaly_write_skew
│   │   │   │   ├── begin
│   │   │   │   ├── commit
│   │   │   │   ├── isolation
│   │   │   │   ├── rollback
│   │   │   │   └── schema
│   │   │   └── writes/
│   │   │       ├── delete
│   │   │       ├── delete_index
│   │   │       ├── delete_reference
│   │   │       ├── delete_where
│   │   │       ├── insert
│   │   │       ├── insert_datatypes
│   │   │       ├── insert_default
│   │   │       ├── insert_index
│   │   │       ├── insert_null
│   │   │       ├── insert_primary_key
│   │   │       ├── insert_reference
│   │   │       ├── insert_unique
│   │   │       ├── update
│   │   │       ├── update_datatypes
│   │   │       ├── update_default
│   │   │       ├── update_expression
│   │   │       ├── update_index
│   │   │       ├── update_null
│   │   │       ├── update_primary_key
│   │   │       ├── update_reference
│   │   │       ├── update_unique
│   │   │       └── update_where
│   │   └── types/
│   │       ├── expression.rs
│   │       ├── mod.rs
│   │       ├── schema.rs
│   │       └── value.rs
│   └── storage/
│       ├── bitcask.rs
│       ├── engine.rs
│       ├── memory.rs
│       ├── mod.rs
│       ├── mvcc.rs
│       └── testscripts/
│           ├── bitcask/
│           │   ├── compact
│           │   ├── compact_open
│           │   ├── log
│           │   └── status
│           ├── engine/
│           │   ├── keys
│           │   ├── point
│           │   ├── scan
│           │   └── scan_prefix
│           ├── memory/
│           │   └── status
│           └── mvcc/
│               ├── anomaly_dirty_read
│               ├── anomaly_dirty_write
│               ├── anomaly_fuzzy_read
│               ├── anomaly_lost_update
│               ├── anomaly_phantom_read
│               ├── anomaly_read_skew
│               ├── anomaly_write_skew
│               ├── bank
│               ├── begin
│               ├── begin_as_of
│               ├── begin_readonly
│               ├── delete
│               ├── delete_conflict
│               ├── get
│               ├── get_isolation
│               ├── resume
│               ├── rollback
│               ├── scan
│               ├── scan_isolation
│               ├── scan_key_version_encoding
│               ├── scan_prefix
│               ├── set
│               ├── set_conflict
│               └── unversioned
└── tests/
    ├── scripts/
    │   ├── anomalies
    │   ├── client
    │   ├── errors
    │   ├── isolation
    │   └── queries
    ├── testcluster.rs
    └── tests.rs

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/ci.yml
================================================
# Continuous integration workflow: builds, tests, lints, format-checks, and
# generates documentation for the crate in a single job.
name: CI
# Runs on every push and pull request, and can also be started manually.
on: [push, pull_request, workflow_dispatch]
# The workflow token only gets read access to the repository contents.
permissions:
  contents: read

jobs:
  test:
    name: Test
    runs-on: ubuntu-latest
    # Abort the job if it runs for more than 10 minutes.
    timeout-minutes: 10

    steps:
    - uses: actions/checkout@v3
    # Install the pinned Rust toolchain together with the lint and format
    # components used by the steps below.
    - uses: dtolnay/rust-toolchain@1.93.1
      id: toolchain
      with:
        components: clippy, rustfmt
    # Cache the build output directory. The key combines the runner OS, the
    # toolchain step's cache key, and a hash of Cargo.lock.
    - uses: actions/cache@v3
      with:
        path: target
        key: ${{runner.os}}-target-${{steps.toolchain.outputs.cachekey}}-${{hashFiles('Cargo.lock')}}
    - run: cargo build --bins --tests
    - run: cargo test
    # Clippy warnings are treated as errors.
    - run: cargo clippy --tests --no-deps -- -D warnings
    - run: cargo fmt --check
    # Rustdoc warnings are treated as errors via RUSTDOCFLAGS.
    - run: cargo doc --no-deps 
      env:
        RUSTDOCFLAGS: -D warnings

================================================
FILE: .gitignore
================================================
/cluster/toydb*/data
/data
/docs/crate/target
/target
.DS_Store
.vscode/
**/*.rs.bk


================================================
FILE: Cargo.toml
================================================
[package]
name = "toydb"
version = "1.0.0"
description = "A simple distributed SQL database, built for education"
authors = ["Erik Grinaker <erik@grinaker.org>"]
license = "Apache-2.0"
homepage = "https://github.com/erikgrinaker/toydb"
repository = "https://github.com/erikgrinaker/toydb"
edition = "2024"
default-run = "toydb"
publish = false

[lib]
doctest = false

[dependencies]
bincode = { version = "2.0", features = ["serde"] }
clap = { version = "4.5", features = ["cargo", "derive"] }
config = "0.15"
crossbeam = { version = "0.8", features = ["crossbeam-channel"] }
dyn-clone = "1.0"
fs4 = "0.13"
hdrhistogram = "7.5"
itertools = "0.14"
log = "0.4"
petname = "2.0.2"
rand = "0.10"
regex = "1.12"
rustyline = "17.0"
rustyline-derive = "0.11"
serde = { version = "1.0", features = ["derive"] }
serde_bytes = "0.11"
simplelog = "0.12"
uuid = { version = "1.21", features = ["serde", "v4"] }

[dev-dependencies]
escargot = "0.5"
goldenscript = "0.7"
hex = "0.4"
paste = "1.0"
serde_json = "1.0"
tempfile = "3.25"
test-case = "3.3"
test_each_file = "0.3"


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

================================================
FILE: README.md
================================================
# <a><img src="./docs/architecture/images/toydb.svg" height="40" valign="top" /></a> toyDB

Distributed SQL database in Rust, built from scratch as an educational project. Main features:

* [Raft distributed consensus][raft] for linearizable state machine replication.

* [ACID transactions][txn] with MVCC-based snapshot isolation.

* [Pluggable storage engine][storage] with [BitCask][bitcask] and [in-memory][memory] backends.

* [Iterator-based query engine][query] with [heuristic optimization][optimizer] and time-travel 
  support.

* [SQL interface][sql] including joins, aggregates, and transactions.

toyDB is intended to be simple and understandable, and also functional and correct. Other aspects
like performance, scalability, and availability are non-goals -- these are major sources of
complexity in production-grade databases, and obscure the basic underlying concepts. Shortcuts have
been taken where possible.

I originally wrote toyDB in 2020 to learn more about database internals. Since then, I've spent
several years building real distributed SQL databases at
[CockroachDB](https://github.com/cockroachdb/cockroach) and
[Neon](https://github.com/neondatabase/neon). Based on this experience, I've rewritten toyDB as a
simple illustration of the architecture and concepts behind distributed SQL databases.

[raft]: https://github.com/erikgrinaker/toydb/blob/main/src/raft/mod.rs
[txn]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/mvcc.rs
[storage]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/engine.rs
[bitcask]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/bitcask.rs
[memory]: https://github.com/erikgrinaker/toydb/blob/main/src/storage/memory.rs
[query]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/execution/executor.rs
[optimizer]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/planner/optimizer.rs
[sql]: https://github.com/erikgrinaker/toydb/blob/main/src/sql/parser/parser.rs

## Documentation

* [Architecture guide](docs/architecture/index.md): a guided tour of toyDB's code and architecture.

* [SQL examples](docs/examples.md): walkthrough of toyDB's SQL features.

* [SQL reference](docs/sql.md): reference documentation for toyDB's SQL dialect.

* [References](docs/references.md): research materials used while building toyDB.

## Usage

With a [Rust compiler](https://www.rust-lang.org/tools/install) installed, a local five-node 
cluster can be built and started as:

```
$ ./cluster/run.sh
Starting 5 nodes on ports 9601-9605 with data under cluster/*/data/.
To connect to node 1, run: cargo run --release --bin toysql

toydb4 21:03:55 [INFO] Listening on [::1]:9604 (SQL) and [::1]:9704 (Raft)
toydb1 21:03:55 [INFO] Listening on [::1]:9601 (SQL) and [::1]:9701 (Raft)
toydb2 21:03:55 [INFO] Listening on [::1]:9602 (SQL) and [::1]:9702 (Raft)
toydb3 21:03:55 [INFO] Listening on [::1]:9603 (SQL) and [::1]:9703 (Raft)
toydb5 21:03:55 [INFO] Listening on [::1]:9605 (SQL) and [::1]:9705 (Raft)
toydb2 21:03:56 [INFO] Starting new election for term 1
[...]
toydb2 21:03:56 [INFO] Won election for term 1, becoming leader
```

A command-line client can be built and used with node 1 on `localhost:9601`:

```
$ cargo run --release --bin toysql
Connected to toyDB node n1. Enter !help for instructions.
toydb> CREATE TABLE movies (id INTEGER PRIMARY KEY, title VARCHAR NOT NULL);
toydb> INSERT INTO movies VALUES (1, 'Sicario'), (2, 'Stalker'), (3, 'Her');
toydb> SELECT * FROM movies;
1, 'Sicario'
2, 'Stalker'
3, 'Her'
```

toyDB supports most common SQL features, including joins, aggregates, and transactions. Below is an
`EXPLAIN` query plan of a more complex query (fetches all movies from studios that have released any
movie with an IMDb rating of 8 or more):

```
toydb> EXPLAIN SELECT m.title, g.name AS genre, s.name AS studio, m.rating
  FROM movies m JOIN genres g ON m.genre_id = g.id,
    studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8
  WHERE m.studio_id = s.id
  GROUP BY m.title, g.name, s.name, m.rating, m.released
  ORDER BY m.rating DESC, m.released ASC, m.title ASC;

Remap: m.title, genre, studio, m.rating (dropped: m.released)
└─ Order: m.rating desc, m.released asc, m.title asc
   └─ Projection: m.title, g.name as genre, s.name as studio, m.rating, m.released
      └─ Aggregate: m.title, g.name, s.name, m.rating, m.released
         └─ HashJoin: inner on m.studio_id = s.id
            ├─ HashJoin: inner on m.genre_id = g.id
            │  ├─ Scan: movies as m
            │  └─ Scan: genres as g
            └─ HashJoin: inner on s.id = good.studio_id
               ├─ Scan: studios as s
               └─ Scan: movies as good (good.rating > 8 OR good.rating = 8)
```

## Architecture

toyDB's architecture is fairly typical for a distributed SQL database: a transactional
key/value store managed by a Raft cluster with a SQL query engine on top. See the
[architecture guide](./docs/architecture/index.md) for more details.

[![toyDB architecture](./docs/architecture/images/architecture.svg)](./docs/architecture/index.md)

## Tests

toyDB mainly uses [Goldenscripts](https://github.com/erikgrinaker/goldenscript) for tests. These
scripts run various scenarios, capture events and output, and later assert that the behavior
remains the same. See e.g.:

* [Raft cluster tests](https://github.com/erikgrinaker/toydb/tree/main/src/raft/testscripts/node)
* [MVCC transaction tests](https://github.com/erikgrinaker/toydb/tree/main/src/storage/testscripts/mvcc)
* [SQL execution tests](https://github.com/erikgrinaker/toydb/tree/main/src/sql/testscripts)
* [End-to-end tests](https://github.com/erikgrinaker/toydb/tree/main/tests/scripts)

Run tests with `cargo test`, or have a look at the latest 
[CI run](https://github.com/erikgrinaker/toydb/actions/workflows/ci.yml).

## Benchmarks

toyDB is not optimized for performance, but comes with a `workload` benchmark tool that can run 
various workloads against a toyDB cluster. For example:

```sh
# Start a 5-node toyDB cluster.
$ ./cluster/run.sh
[...]

# Run a read-only benchmark via all 5 nodes.
$ cargo run --release --bin workload read
Preparing initial dataset... done (0.179s)
Spawning 16 workers... done (0.006s)
Running workload read (rows=1000 size=64 batch=1)...

Time   Progress     Txns      Rate       p50       p90       p99      pMax
1.0s      13.1%    13085   13020/s     1.3ms     1.5ms     1.9ms     8.4ms
2.0s      27.2%    27183   13524/s     1.3ms     1.5ms     1.8ms     8.4ms
3.0s      41.3%    41301   13702/s     1.2ms     1.5ms     1.8ms     8.4ms
4.0s      55.3%    55340   13769/s     1.2ms     1.5ms     1.8ms     8.4ms
5.0s      70.0%    70015   13936/s     1.2ms     1.5ms     1.8ms     8.4ms
6.0s      84.7%    84663   14047/s     1.2ms     1.4ms     1.8ms     8.4ms
7.0s      99.6%    99571   14166/s     1.2ms     1.4ms     1.7ms     8.4ms
7.1s     100.0%   100000   14163/s     1.2ms     1.4ms     1.7ms     8.4ms

Verifying dataset... done (0.002s)
```

The available workloads are:

* `read`: single-row primary key lookups.
* `write`: single-row inserts to sequential primary keys.
* `bank`: bank transfers between various customers and accounts. To make things interesting, this
  includes joins, secondary indexes, sorting, and conflicts.

For more information about workloads and parameters, run `cargo run --bin workload -- --help`.

Example workload results are listed below. Write performance is atrocious, due to
[fsync](https://en.wikipedia.org/wiki/Sync_(Unix)) and a lack of write batching in the Raft layer.
Disabling fsync, or using the in-memory engine, significantly improves write performance (at the
expense of durability).

| Workload | BitCask     | BitCask w/o fsync | Memory      |
|----------|-------------|-------------------|-------------|
| `read`   | 14163 txn/s | 13941 txn/s       | 13949 txn/s |
| `write`  | 35 txn/s    | 4719 txn/s        | 7781 txn/s  |
| `bank`   | 21 txn/s    | 1120 txn/s        | 1346 txn/s  |

## Debugging

[VSCode](https://code.visualstudio.com) and the [CodeLLDB](https://marketplace.visualstudio.com/items?itemName=vadimcn.vscode-lldb)
extension can be used to debug toyDB, with the debug configuration under `.vscode/launch.json`.

Under the "Run and Debug" tab, select e.g. "Debug executable 'toydb'" or "Debug unit tests in
library 'toydb'".

## Credits

The toyDB logo is courtesy of [@jonasmerlin](https://github.com/jonasmerlin).

================================================
FILE: cluster/run.sh
================================================
#!/usr/bin/env bash
#
# This script builds and runs a 5-node toyDB cluster listening on ports
# 9601-9605. Config and data are stored under the toydb* directories.
# To connect a toysql client to node 1 on port 9601, run:
#
# cargo run --release --bin toysql

# Abort on command errors, unset variables, and failures anywhere in a pipeline.
set -euo pipefail

# Change into the script directory, since the config paths below are relative
# to it. $0 is quoted so that paths containing spaces or glob characters work.
cd "$(dirname "$0")"

# Build toyDB using release optimizations.
cargo build --release --bin toydb

# Start nodes 1-5 in the background, prefixing their output with the node ID.
echo "Starting 5 nodes on ports 9601-9605 with data under cluster/*/data/."
echo "To connect to node 1, run: cargo run --release --bin toysql"
echo ""

for ID in 1 2 3 4 5; do
    # Merge stderr into stdout so that sed prefixes both streams.
    (cargo run -q --release -- -c toydb$ID/toydb.yaml 2>&1 | sed -e "s/\\(.*\\)/toydb$ID \\1/g") &
done

# Wait for the background processes to exit. Kill all toyDB processes when the
# script exits (e.g. via Ctrl-C). The negative PID signals the whole process
# group, and errors from kill are discarded.
trap 'kill -TERM -- -$$ 2>/dev/null' INT TERM EXIT
wait

================================================
FILE: cluster/toydb1/toydb.yaml
================================================
id: 1
data_dir: toydb1/data
listen_sql: localhost:9601
listen_raft: localhost:9701
peers:
  '2': localhost:9702
  '3': localhost:9703
  '4': localhost:9704
  '5': localhost:9705

================================================
FILE: cluster/toydb2/toydb.yaml
================================================
id: 2
data_dir: toydb2/data
listen_sql: localhost:9602
listen_raft: localhost:9702
peers:
  '1': localhost:9701
  '3': localhost:9703
  '4': localhost:9704
  '5': localhost:9705

================================================
FILE: cluster/toydb3/toydb.yaml
================================================
id: 3
data_dir: toydb3/data
listen_sql: localhost:9603
listen_raft: localhost:9703
peers:
  '1': localhost:9701
  '2': localhost:9702
  '4': localhost:9704
  '5': localhost:9705

================================================
FILE: cluster/toydb4/toydb.yaml
================================================
id: 4
data_dir: toydb4/data
listen_sql: localhost:9604
listen_raft: localhost:9704
peers:
  '1': localhost:9701
  '2': localhost:9702
  '3': localhost:9703
  '5': localhost:9705

================================================
FILE: cluster/toydb5/toydb.yaml
================================================
id: 5
data_dir: toydb5/data
listen_sql: localhost:9605
listen_raft: localhost:9705
peers:
  '1': localhost:9701
  '2': localhost:9702
  '3': localhost:9703
  '4': localhost:9704

================================================
FILE: config/toydb.yaml
================================================
# The node ID (must be unique in the cluster), and map of peer IDs and Raft
# addresses (empty for single node).
id: 1
peers: {}

# Addresses to listen for SQL and Raft connections on.
listen_sql: localhost:9601
listen_raft: localhost:9701

# The log level. Valid values are DEBUG, INFO, WARN, and ERROR.
log_level: INFO

# Node data directory. The Raft log is stored in the file "raft", and the SQL
# database in "sql".
data_dir: data

# Storage engine to use for the Raft log and SQL database.
#
# * bitcask (default): an append-only log-structured store.
# * memory: an in-memory store using the Rust standard library's BTreeMap.
storage_raft: bitcask
storage_sql: bitcask

# Whether to fsync writes to disk. Disabling this yields much better write
# performance, but may lose data on host crashes and violate Raft guarantees. It
# only affects Raft log writes (the SQL state machine is never fsynced since it
# can be reconstructed from the Raft log).
fsync: true

# The minimum garbage fraction and bytes to trigger Bitcask log compaction on
# node startup.
compact_threshold: 0.2
compact_min_bytes: 1000000

================================================
FILE: docs/architecture/README.md
================================================
See [`index.md`](index.md).

================================================
FILE: docs/architecture/client.md
================================================
# Client

The toyDB client is in the [`client`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs)
module. It uses the same Bincode-based protocol that we saw in the server section, sending
`toydb::Request` and receiving `toydb::Response`.

## Client Library

The main client library `toydb::Client` is used to communicate with a toyDB server:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L15-L24

When initialized, it connects to a toyDB server over TCP, which establishes a SQL session for it:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L27-L33

It can then send Bincode-encoded `toydb::Request` to the server, and receive `toydb::Response`
back.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L35-L40


In particular, `Client::execute` can be used to execute arbitrary SQL statements in the client's
current session:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/client.rs#L42-L56

## `toysql` Binary

However, `toydb::Client` is a programmatic API, and we want a more convenient user interface.
The `toysql` client in [`src/bin/toysql.rs`](https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs)
provides a typical [REPL](https://en.wikipedia.org/wiki/Read–eval–print_loop) (read-evaluate-print loop) where users can enter SQL statements and view the results.

Like `toydb`, `toysql` is a tiny [`clap`](https://docs.rs/clap/latest/clap/) command that takes a
toyDB server address to connect to and starts an interactive shell:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L29-L53

It first attempts to connect to the toyDB server using the `toydb::Client` client, and then starts
an interactive shell using the [Rustyline](https://docs.rs/rustyline/latest/rustyline/) library.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L55-L81

The shell is simply a loop that prompts the user to input a SQL statement:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L216-L250

Each statement is then executed against the server via `toydb::Client::execute`, and the response
is formatted and printed as output:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L83-L92

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/bin/toysql.rs#L175-L204

And with that, we have a fully functional SQL database system and can run queries to our heart's
content. Have fun!

---

<p align="center">
← <a href="server.md">Server</a>
</p>

================================================
FILE: docs/architecture/encoding.md
================================================
# Key/Value Encoding

The key/value store uses binary `Vec<u8>` keys and values, so we need an encoding scheme to 
translate between in-memory Rust data structures and the on-disk binary data. This is provided by
the [`encoding`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding)
module, with separate schemes for key and value encoding.

## `Bincode` Value Encoding

Values are encoded using [Bincode](https://github.com/bincode-org/bincode), a third-party binary
encoding scheme for Rust. Bincode is convenient because it can easily encode any arbitrary Rust
data type. But we could also have chosen e.g. [JSON](https://en.wikipedia.org/wiki/JSON),
[Protobuf](https://protobuf.dev), [MessagePack](https://msgpack.org/), or any other encoding.

We won't dwell on the actual binary format here, see the [Bincode specification](https://git.sr.ht/~stygianentity/bincode/tree/trunk/item/docs/spec.md)
for details.

To use a consistent configuration for all encoding and decoding, we provide helper functions in
the [`encoding::bincode`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding/bincode.rs)
module which use `bincode::config::standard()`.

https://github.com/erikgrinaker/toydb/blob/0ce1fb34349fda043cb9905135f103bceb4395b4/src/encoding/bincode.rs#L15-L27

Bincode uses the very common [Serde](https://serde.rs) framework for its API. toyDB also provides an
`encoding::Value` helper trait for value types which adds automatic `encode()` and `decode()`
methods:

https://github.com/erikgrinaker/toydb/blob/b57ae6502e93ea06df00d94946a7304b7d60b977/src/encoding/mod.rs#L39-L68

Here's an example of how this can be used to encode and decode an arbitrary `Dog` data type:

```rust
#[derive(serde::Serialize, serde::Deserialize)]
struct Dog {
    name: String,
    age: u8,
    good_boy: bool,
}

impl encoding::Value for Dog {}

let pluto = Dog { name: "Pluto".into(), age: 4, good_boy: true };
let bytes = pluto.encode();
println!("{bytes:02x?}");

// Outputs [05, 50, 6c, 75, 74, 6f, 04, 01]:
//
// * Length of string "Pluto": 05.
// * String "Pluto": 50 6c 75 74 6f.
// * Age 4: 04.
// * Good boy: 01 (true).

let pluto = Dog::decode(&bytes)?; // gives us back Pluto
```

## `Keycode` Key Encoding

Unlike values, keys can't just use any binary encoding like Bincode. As mentioned in the storage
section, the storage engine sorts data by key to enable range scans. The key encoding must therefore
preserve the [lexicographical order](https://en.wikipedia.org/wiki/Lexicographic_order) of the
encoded values: the binary byte slices must sort in the same order as the original values.

As an example of why we can't just use Bincode, consider the strings "house" and "key". These should
be sorted in alphabetical order: "house" before "key". However, Bincode encodes strings prefixed by
their length, so "key" would be sorted before "house" in binary form:

```
03 6b 65 79        ← 3 bytes: key
05 68 6f 75 73 65  ← 5 bytes: house
```

For similar reasons, we can't just encode numbers in their native binary form: the
[little-endian](https://en.wikipedia.org/wiki/Endianness) representation will order very large
numbers before small numbers, and the [sign bit](https://en.wikipedia.org/wiki/Sign_bit) will order
positive numbers before negative numbers. This would violate the natural ordering of numbers.

We also have to be careful with value sequences, which should be ordered element-wise. For example,
the pair ("a", "xyz") should be ordered before ("ab", "cd"), so we can't just encode the strings
one after the other like "axyz" and "abcd" since that would sort ("ab", "cd") first.

toyDB provides an order-preserving encoding called "Keycode" in the [`encoding::keycode`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/encoding/keycode.rs)
module. Like Bincode, the Keycode encoding is not self-describing: the binary data does not say what
the data type is, the caller must provide a type to decode into. It only supports a handful of
primitive data types, and only needs to order values of the same type.

Keycode is implemented as a [Serde](https://serde.rs) (de)serializer, which requires a lot of
boilerplate code to satisfy the trait, but we'll just focus on the actual encoding. The encoding
scheme is as follows:

* `bool`: `00` for `false` and `01` for `true`.

    https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L113-L117

* `u64`: the [big-endian](https://en.wikipedia.org/wiki/Endianness) binary encoding.

    https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L157-L161

* `i64`: the [big-endian](https://en.wikipedia.org/wiki/Endianness) binary encoding, but with the
   sign bit flipped to order negative numbers before positive ones.

    https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L131-L143

* `f64`: the [big-endian IEEE 754](https://en.wikipedia.org/wiki/Double-precision_floating-point_format)
  binary encoding, but with the sign bit flipped, and all bits flipped for negative numbers, to
  order negative numbers correctly.

    https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L167-L179

* `Vec<u8>`: terminated by `00 00`, with `00` escaped as `00 ff` to disambiguate it.

    https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L190-L205

* `String`: like `Vec<u8>`.

    https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L185-L188

* `Vec<T>`, `[T]`, `(T,)`: the concatenation of the inner values.

    https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L295-L307

* `enum`: the variant's numerical index as a `u8`, then the inner values (if any).

    https://github.com/erikgrinaker/toydb/blob/2027641004989355c2162bbd9eeefcc991d6b29b/src/encoding/keycode.rs#L223-L227

Like `encoding::Value`, there is also an `encoding::Key` helper trait:

https://github.com/erikgrinaker/toydb/blob/b57ae6502e93ea06df00d94946a7304b7d60b977/src/encoding/mod.rs#L20-L37

Different kinds of keys are usually represented as enums. For example, if we wanted to store cars
and video games, we could use:

```rust
#[derive(serde::Serialize, serde::Deserialize)]
enum Key {
    Car(String, String, u64),    // make, model, year
    Game(String, u64, Platform), // name, year, platform
}

#[derive(serde::Serialize, serde::Deserialize)]
enum Platform {
    PC,
    PS5,
    Switch,
    Xbox,
}

impl encoding::Key for Key {}

let returnal = Key::Game("Returnal".into(), 2021, Platform::PS5);
let bytes = returnal.encode();
println!("{bytes:02x?}");

// Outputs [01, 52, 65, 74, 75, 72, 6e, 61, 6c, 00, 00, 00, 00, 00, 00, 00, 00, 07, e5, 01].
//
// * Key::Game: 01
// * Returnal: 52 65 74 75 72 6e 61 6c 00 00
// * 2021: 00 00 00 00 00 00 07 e5
// * Platform::PS5: 01

let returnal = Key::decode(&bytes)?;
```

Because the keys are sorted in element-wise order, this would allow us to e.g. perform a prefix
scan to fetch all platforms which Returnal (2021) was released on, or perform a range scan to fetch 
all models of Nissan Altima released between 2010 and 2015.

---

<p align="center">
← <a href="storage.md">Storage Engine</a> &nbsp; | &nbsp; <a href="mvcc.md">MVCC Transactions</a> →
</p>

================================================
FILE: docs/architecture/index.md
================================================
# toyDB Architecture

toyDB is a simple distributed SQL database, intended to illustrate how such systems are built. The
overall structure is similar to real-world distributed databases, but the design and implementation
has been kept as simple as possible for understandability. Performance and scalability are explicit
non-goals, as these are major sources of complexity in real-world systems.

This guide will walk through toyDB's architecture and code from the bottom up, with plenty of links
to the actual source code.

> ℹ️ View on GitHub with a desktop browser for inline code listings.

* [Overview](overview.md)
  * [Properties](overview.md#properties)
  * [Components](overview.md#components)
* [Storage Engine](storage.md)
  * [`Memory` Storage Engine](storage.md#memory-storage-engine)
  * [`BitCask` Storage Engine](storage.md#bitcask-storage-engine)
* [Key/Value Encoding](encoding.md)
  * [`Bincode` Value Encoding](encoding.md#bincode-value-encoding)
  * [`Keycode` Key Encoding](encoding.md#keycode-key-encoding)
* [MVCC Transactions](mvcc.md)
* [Raft Consensus](raft.md)
  * [Log Storage](raft.md#log-storage)
  * [State Machine Interface](raft.md#state-machine-interface)
  * [Node Roles](raft.md#node-roles)
  * [Node Interface and Communication](raft.md#node-interface-and-communication)
  * [Leader Election and Terms](raft.md#leader-election-and-terms)
  * [Client Requests and Forwarding](raft.md#client-requests-and-forwarding)
  * [Write Replication and Application](raft.md#write-replication-and-application)
  * [Read Processing](raft.md#read-processing)
* [SQL Engine](sql.md)
  * [Data Model](sql-data.md)
    * [Data Types](sql-data.md#data-types)
    * [Schemas](sql-data.md#schemas)
    * [Expressions](sql-data.md#expressions)
  * [Storage](sql-storage.md)
    * [Key/Value Representation](sql-storage.md#keyvalue-representation)
    * [Schema Catalog](sql-storage.md#schema-catalog)
    * [Row Storage and Transactions](sql-storage.md#row-storage-and-transactions)
  * [Raft Replication](sql-raft.md)
  * [Parsing](sql-parser.md)
    * [Lexer](sql-parser.md#lexer)
    * [Abstract Syntax Tree](sql-parser.md#abstract-syntax-tree)
    * [Parser](sql-parser.md#parser)
  * [Planning](sql-planner.md)
    * [Execution Plan](sql-planner.md#execution-plan)
    * [Scope and Name Resolution](sql-planner.md#scope-and-name-resolution)
    * [Planner](sql-planner.md#planner)
  * [Optimization](sql-optimizer.md)
    * [Constant Folding](sql-optimizer.md#constant-folding)
    * [Filter Pushdown](sql-optimizer.md#filter-pushdown)
    * [Index Lookups](sql-optimizer.md#index-lookups)
    * [Hash Join](sql-optimizer.md#hash-join)
    * [Short Circuiting](sql-optimizer.md#short-circuiting)
  * [Execution](sql-execution.md)
    * [Plan Executor](sql-execution.md#plan-executor)
    * [Session Management](sql-execution.md#session-management)
* [Server](server.md)
  * [Raft Routing](server.md#raft-routing)
  * [SQL Service](server.md#sql-service)
  * [`toydb` Binary](server.md#toydb-binary)
* [Client](client.md)
  * [Client Library](client.md#client-library)
  * [`toysql` Binary](client.md#toysql-binary)

---

<p align="center">
<a href="overview.md">Overview</a> →
</p>

================================================
FILE: docs/architecture/mvcc.md
================================================
# MVCC Transactions

Transactions are groups of reads and writes (e.g. to different keys) that are submitted together as
a single unit. For example, a bank transaction that transfers $100 from account A to account B might
consist of this group of reads and writes:

```
a = get(A)
b = get(B)
if a < 100:
    error("insufficient balance")
set(A, a - 100)
set(B, b + 100)
```

toyDB provides [ACID](https://en.wikipedia.org/wiki/ACID) transactions, a set of very strong
guarantees:

* **Atomicity:** all of the writes take effect as a single, atomic unit, at the same instant, when
  they are _committed_. Other users will never see some of the writes without the others.

* **Consistency:** database constraints are never violated (e.g. referential integrity or uniqueness
  constraints). We'll see how this is implemented later in the SQL execution layer.

* **Isolation:** users should appear to have the entire database to themselves, unaffected by other
  simultaneous users. Two transactions may conflict, in which case one has to retry, but if a
  transaction succeeds then the user knows with certainty that the operations were executed without
  interference by anyone else. This eliminates the risk of [race conditions](https://en.wikipedia.org/wiki/Race_condition).
  
* **Durability:** committed writes are never lost (even if the system crashes).

To illustrate how transactions work, here's an example MVCC test script where two concurrent users
modify a set of bank accounts (there are many [other test scripts](https://github.com/erikgrinaker/toydb/tree/aa14deb71f650249ce1cab8828ed7bcae2c9206e/src/storage/testscripts/mvcc)
there too):

https://github.com/erikgrinaker/toydb/blob/a73e24b7e77671b9f466e0146323cd69c3e27bdf/src/storage/testscripts/mvcc/bank#L1-L69

To provide these guarantees, toyDB uses a common technique called
[Multi-Version Concurrency Control](https://en.wikipedia.org/wiki/Multiversion_concurrency_control)
(MVCC). It is implemented at the key/value storage level, in the [`storage::mvcc`](https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs)
module. It uses a `storage::Engine` for actual data storage.

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L220-L231

MVCC provides an [isolation level](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Isolation_levels)
called [snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation): a transaction sees a
snapshot of the database as it was when the transaction began. Any later changes are invisible to
it.

It does this by storing historical versions of key/value pairs. The version number is simply a
number that's incremented for every new transaction:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L155-L158

Each transaction has its own unique version number. When it writes a key/value pair it appends its
version number to the key as `Key::Version(&[u8], Version)` (using the Keycode encoding we've seen
previously). If an old version of the key already exists, it will have a different version number
suffix and therefore be stored as a separate key in the storage engine. Deleted keys are versions
with a special tombstone value.

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L183-L189

Here's a simple diagram of what a history of versions 1 to 5 of keys `a` to `d` might look like:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L11-L26

Additionally, we need to keep track of the currently ongoing (uncommitted) transaction versions,
known as the "active set".

With versioning and the active set, we can summarize the MVCC protocol with a few simple rules:

1. When a new transaction begins, it:
    * Obtains the next available version number.
    * Takes a snapshot of the active set (other uncommitted transactions).
    * Adds its version number to the active set.

2. When the transaction reads a key, it:
    * Returns the latest version of the key at or below its own version.
    * Ignores versions above its own version.
    * Ignores versions in its active set snapshot.

3. When the transaction writes a key, it:
    * Looks for a key version above its own version; errors if found.
    * Looks for a key version in its active set snapshot; errors if found.
    * Writes a key/value pair with its own version.

4. When the transaction commits, it:
    * Flushes all writes to disk.
    * Removes itself from the active set.

The magic happens when the transaction removes itself from the active set. This is a single, atomic
operation, and when it completes all of its writes immediately become visible to _new_ transactions.
However, ongoing transactions still won't see these writes, because the version is still in their
active set snapshot or at a later version (hence they are isolated from this transaction).

Furthermore, the transaction could see its own uncommitted writes even though no one else could, and
if any writes conflicted with another transaction it would error out and have to retry.

Not only that, this also allows us to do time-travel queries, where we can query the database as it
was at any time in the past: we simply pick a version number to read at.

There are a few more details that we've left out here: transaction rollbacks need to keep track of
the writes and undo them, and read-only queries can avoid allocating new version numbers. We also
don't garbage collect old versions, for simplicity. See the module documentation for more details:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L1-L140

Let's walk through a simple example with code pointers to get a feel for how this is implemented.
Notice how we don't have to deal with any version numbers when we're using the MVCC API -- this is
an internal MVCC implementation detail.

```rust
// Open a BitCask database in the file "toy.db" with MVCC support.
let path = PathBuf::from("toy.db");
let db = MVCC::new(BitCask::new(path)?);

// Begin a new transaction.
let txn = db.begin()?;

// Read the key "foo", and decode the binary value as a u64 with bincode.
let bytes = txn.get(b"foo")?.expect("foo not found");
let mut value: u64 = bincode::deserialize(&bytes)?;

// Delete "foo".
txn.delete(b"foo")?;

// Add 1 to the value, and write it back to the key "bar".
value += 1;
let bytes = bincode::serialize(&value);
txn.set(b"bar", bytes)?;

// Commit the transaction.
txn.commit()?;
```

First, we begin a new transaction with `MVCC::begin()`, which calls through to
`Transaction::begin()`. This obtains a version number stored in `Key::NextVersion` and increments
it, then takes a snapshot of the active set in `Key::ActiveSet` and adds itself to it:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L368-L391

This returns a `Transaction` object which provides the main key/value API, with get/set/delete
methods. It keeps track of the main state of the transaction: its version number and active set.

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L294-L327

Next, we call `Transaction::get(b"foo")` to read the value of the key `foo`. This finds the latest
version that's visible to us (ignoring future versions and the active set). Recall that we store
multiple versions of each key as `Key::Version(key, version)`. The Keycode encoding ensures that all
versions are stored in sorted order, so we can do a reverse range scan from `Key::Version(b"foo",
self.version)` to `Key::Version(b"foo", 0)` and return the latest version that's visible to us:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L564-L581

We then call `Transaction::delete(b"foo")` and `Transaction::set(b"bar", value)`. Both of these just
call through to the same `Transaction::write_version()` method, but use `Some(value)` for a regular
key/value pair and `None` as a deletion tombstone:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L514-L522

To write a new version of a key, we first have to check for conflicts by seeing if there's a
version of the key that's invisible to us -- if there is, we conflicted with a concurrent transaction.
We use a range scan for this, like we did in `Transaction::get()`.

If there are no conflicts, we go on to write `Key::Version(b"foo", self.version)` and encode the
value as an `Option<value>` to accommodate the `None` tombstone marker. We also write a
`Key::TxnWrite(version, key)` to keep track of the keys we've written in case we have to roll back.

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L524-L562

Finally, `Transaction::commit()` will make our transaction take effect and become visible. It does
this simply by removing itself from the active set in `Key::ActiveSet`, and also cleaning up its
`Key::TxnWrite` write tracking. As the comment says, we don't actually have to flush to durable
storage here, because the Raft log will provide durability for us -- we'll get back to this later.

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/mvcc.rs#L466-L485

---

<p align="center">
← <a href="encoding.md">Key/Value Encoding</a> &nbsp; | &nbsp; <a href="raft.md">Raft Consensus</a> →
</p>

================================================
FILE: docs/architecture/overview.md
================================================
# Overview

toyDB consists of a cluster of nodes that execute [SQL](https://en.wikipedia.org/wiki/SQL)
transactions against a replicated state machine. Clients can connect to any node in the cluster and
submit SQL statements. The cluster remains available if a minority of nodes crash or disconnect,
but halts if a majority of nodes fail.

## Properties

* **Distributed:** runs across a cluster of nodes.
* **Highly available:** tolerates failure of a minority of nodes.
* **SQL compliant:** correctly supports most common [SQL](https://en.wikipedia.org/wiki/SQL)
  features.
* **Strongly consistent:** committed writes are immediately visible to all readers ([linearizability](https://en.wikipedia.org/wiki/Linearizability)).
* **Transactional:** provides [ACID](https://en.wikipedia.org/wiki/ACID) transactions
  * **Atomic:** groups of writes are applied as a single, atomic unit.
  * **Consistent:** database constraints and referential integrity are always enforced.
  * **Isolated:** concurrent transactions don't affect each other ([snapshot isolation](https://en.wikipedia.org/wiki/Snapshot_isolation)).
  * **Durable:** committed writes are never lost.

For simplicity, toyDB is:

* **Not scalable:** every node stores the full dataset, and reads/writes execute on one node.
* **Not reliable:** only handles crash failures, not e.g. partial network partitions or node stalls.
* **Not performant:** data processing is slow, and not optimized at all.
* **Not efficient:** loads entire tables into memory, no compression or garbage collection, etc.
* **Not full-featured:** only basic SQL functionality is implemented.
* **Not backwards compatible:** changes to data formats and protocols will break databases.
* **Not flexible:** nodes can't be added or removed while running, and take a long time to join.
* **Not secure:** there is no authentication, authorization, nor encryption.

## Components

Internally, toyDB is made up of a few main components:

* **Storage engine:** stores data on disk and manages transactions.
* **Raft consensus engine:** replicates data and coordinates cluster nodes.
* **SQL engine:** organizes SQL data, manages SQL sessions, and executes SQL statements.
* **Server:** manages network communication, both with SQL clients and Raft nodes.
* **Client:** provides a SQL user interface and communicates with the server.

This diagram illustrates the internal structure of a single toyDB node:

![toyDB architecture](./images/architecture.svg)

We will go through each of these components from the bottom up.

---

<p align="center">
← <a href="index.md">toyDB Architecture</a> &nbsp; | &nbsp; <a href="storage.md">Storage Engine</a> →
</p>

================================================
FILE: docs/architecture/raft.md
================================================
# Raft Consensus

[Raft](https://raft.github.io) is a distributed consensus protocol which replicates data across a
cluster of nodes in a consistent and durable manner. It is described in the very readable
[Raft paper](https://raft.github.io/raft.pdf), and in the more comprehensive
[Raft thesis](https://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf).

The toyDB Raft implementation is in the [`raft`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/raft)
module, and is described in the module documentation:

https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/mod.rs#L1-L240

Raft is fundamentally the same protocol as [Paxos](https://lamport.azurewebsites.net/pubs/paxos-simple.pdf)
and [Viewstamped Replication](https://pmg.csail.mit.edu/papers/vr-revisited.pdf), but an
opinionated variant designed to be simple, understandable, and practical. It is widely used in the
industry: [CockroachDB](https://www.cockroachlabs.com), [TiDB](https://www.pingcap.com),
[etcd](https://etcd.io), [Consul](https://developer.hashicorp.com/consul), and many others.

Briefly, Raft elects a leader node which coordinates writes and replicates them to followers. Once a
majority (>50%) of nodes have acknowledged a write, it is considered durably committed. It is common
for the leader to also serve reads, since it always has the most recent data and is thus strongly
consistent.

A cluster must have a majority of nodes (known as a [quorum](https://en.wikipedia.org/wiki/Quorum_(distributed_computing)))
live and connected to remain available, otherwise it will not commit writes in order to guarantee
data consistency and durability. Since there can only be one majority in the cluster, this prevents
a [split brain](https://en.wikipedia.org/wiki/Split-brain_(computing)) scenario where two active
leaders can exist concurrently (e.g. during a [network partition](https://en.wikipedia.org/wiki/Network_partition))
and store conflicting values.

The Raft leader appends writes to an ordered command log, which is then replicated to followers.
Once a majority has replicated the log up to a given entry, that log prefix is committed and then
applied to a state machine. This ensures that all nodes will apply the same commands in the same
order and eventually reach the same state (assuming the commands are deterministic). Raft itself
doesn't care what the state machine and commands are, but in toyDB's case it's SQL tables and rows
stored in an MVCC key/value store.

This diagram from the Raft paper illustrates how a Raft node receives a command from a client (1),
adds it to its log and reaches consensus with other nodes (2), then applies it to its state machine
(3) before returning a result to the client (4):

<img src="./images/raft.svg" alt="Raft node" width="400" style="display: block; margin: 30px auto;">

You may notice that Raft is not very scalable, since all reads and writes go via the leader node,
and every node must store the entire dataset. Raft solves replication and availability, but not
scalability. Real-world systems typically provide horizontal scalability by splitting a large
dataset across many separate Raft clusters (i.e. sharding), but this is out of scope for toyDB.

For simplicity, toyDB implements the bare minimum of Raft, and omits optimizations described in
the paper such as state snapshots, log truncation, leader leases, and more. The implementation is
in the [`raft`](https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/mod.rs)
module, and we'll walk through the main components next.

There is a comprehensive set of Raft test scripts in [`src/raft/testscripts/node`](https://github.com/erikgrinaker/toydb/blob/386153f5c00cb1a88b1ac8489ae132674d96f68a/src/raft/testscripts/node),
which illustrate the protocol in a wide variety of scenarios.

## Log Storage

Raft replicates an ordered command log consisting of `raft::Entry`:

https://github.com/erikgrinaker/toydb/blob/90a6cae47ac20481ac4eb2f20eea50f02e6c2b33/src/raft/log.rs#L10-L28

`index` specifies the position in the log, and `command` contains the binary command to apply to the
state machine. The `term` identifies the leadership term in which the command was proposed: a new
term begins when a new leader election is held (we'll get back to this later).

Entries are appended to the log by the leader and replicated to followers. Once acknowledged by a
quorum, the log up to that index is committed and will never change. Entries that are not yet
committed may be replaced or removed if the leader changes.

The Raft log enforces the following invariants:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L80-L91

`raft::Log` implements a Raft log, and stores log entries in a `storage::Engine` key/value store:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L43-L116

It also stores some additional metadata that we'll need later: the current term, vote, and commit
index. These are stored as separate keys:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L30-L39

Individual entries are appended to the log via `Log::append`, typically when the leader wants to
replicate a new write:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L190-L203

Entries can also be appended in bulk via `Log::splice`, typically when entries are replicated to
followers. This also allows replacing existing uncommitted entries, e.g. after a leader change:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L269-L343

Committed entries are marked by `Log::commit`, making them immutable and eligible for state machine
application:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L205-L222

The log also has methods to read entries from the log, either individually as `Log::get` or by
iterating over a range with `Log::scan`:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/log.rs#L224-L267

## State Machine Interface

Raft doesn't know or care what the log commands are, nor what the state machine does with them. It
simply takes `raft::Entry` from the log and gives them to the state machine.

The Raft state machine is represented by the `raft::State` trait. Raft will ask about the last
applied entry via `State::get_applied_index`, and feed it newly committed entries via
`State::apply`. It also allows reads via `State::read`, but we'll get back to that later.

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/state.rs#L4-L51

The state machine does not have to flush its state to durable storage after each transition; on node
crashes, the state machine is allowed to regress, and will be caught up by replaying the unapplied
log entries. It is also possible to implement a purely in-memory state machine (and in fact, toyDB
allows running the state machine with a `Memory` storage engine).

The state machine must take care to be deterministic: the same commands applied in the same order
must result in the same state across all nodes. This means that a command can't e.g. read the
current time or generate a random number -- these values must be included in the command. It also
means that non-deterministic errors, such as an IO error, must halt command application (in toyDB's
case, we just panic and crash the node).

In toyDB, the state machine is an MVCC key/value store that stores SQL tables and rows, as we'll
see in the SQL Raft replication section.

## Node Roles

In Raft, a node can have one out of three roles:

* **Leader:** replicates writes to followers and serves client requests.
* **Follower:** replicates writes from a leader.
* **Candidate:** campaigns for leadership.

The Raft paper summarizes these roles and transitions in the following diagram (we'll discuss
leader election in detail below):

<img src="./images/raft-states.svg" alt="Raft states" width="400" style="display: block; margin: 30px auto;">

In toyDB, a node is represented by the `raft::Node` enum, with variants for each state:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L47-L66

This wraps the `raft::RawNode<Role>` type which contains the inner node state. It is generic over
the role, and uses the [typestate pattern](http://cliffle.com/blog/rust-typestate/) to provide
methods and transitions depending on the node's current role. This enforces state transitions and
invariants at compile time via Rust's type system -- for example, only `RawNode<Candidate>` has an
`into_leader()` method, since only candidates can transition to leaders (when they win an election).

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L156-L177

The `RawNode::role` field contains role-specific state as structs implementing the `Role` marker
trait:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L661-L680

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L242-L255

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L523-L531

We'll see what the various fields are used for in the following sections.

## Node Interface and Communication

The `raft::Node` enum has two main methods that drive the node: `tick()` and `step()`. These consume
the current node and return a new node, possibly with a different role.

`tick()` advances time by a logical tick. This is used to measure the passage of time, e.g. to
trigger election timeouts or periodic leader heartbeats. toyDB uses a tick interval of 100
milliseconds (see `raft::TICK_INTERVAL`), and will call `tick()` on the node at this rate.

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L125-L132

`step()` processes an inbound message from a different node or client:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L107-L123

Outbound messages to other nodes are sent via the `RawNode::tx` channel:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L171-L172

Nodes are identified by a unique node ID, which is given at node startup:

https://github.com/erikgrinaker/toydb/blob/90a6cae47ac20481ac4eb2f20eea50f02e6c2b33/src/raft/node.rs#L17-L18

Messages are wrapped in a `raft::Envelope` specifying the sender and recipient:

https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L10-L21

The envelope contains a `raft::Message`, an enum which encodes the Raft message protocol. We won't
dwell on the specific message types here, but discuss them individually in the following sections.
Raft does not require reliable message delivery, so messages may be dropped or reordered at any
time, although toyDB's use of TCP provides stronger delivery guarantees.

https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L25-L152

This is an entirely synchronous and deterministic model -- the same sequence of calls on a given
node in a given initial state will always produce the same result. This is very convenient for
testing and understandability. We will see in the server section how toyDB drives the node on a
separate thread, provides a network transport for messages, and ticks it at regular intervals.

## Leader Election and Terms

In the steady state, Raft simply has a leader which replicates writes to followers. But to reach
this steady state, we must elect a leader, which is where much of the subtle complexity lies. See
the Raft paper for comprehensive details and safety arguments; we'll summarize it briefly below.

Raft divides time into _terms_. The term is a monotonically increasing number starting at 1. There
can only be one leader in a term (or none if an election fails), and the term can never regress.
Replicated commands belong to the specific term under which they were proposed.

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L20-L21

Let's walk through an election, where we bootstrap a brand new, empty toyDB cluster with 3 nodes.

Nodes are initialized by calling `Node::new()`. Since this is a new cluster, they are given an empty
`raft::Log` and `raft::State`, at term 0. Nodes start with role `Follower`, but without a leader.

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L68-L87

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L266-L290

Now, nothing really happens for a while, as the nodes are waiting to maybe hear from an existing
leader (there is none). Every 100 ms we call `tick()`, until we reach `election_timeout`:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L489-L497

Notice how `new()` set `election_timeout` to a random value (in the range `ELECTION_TIMEOUT_RANGE`
of 10-20 ticks, i.e. 1-2 seconds). If all nodes had the same timeout, they would likely campaign for
leadership simultaneously, resulting in an election tie -- Raft uses randomized election timeouts to
avoid such ties.

Once a node reaches `election_timeout` it transitions to role `Candidate`:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L292-L312

When it becomes a candidate it campaigns for leadership by increasing its term to 1, voting for
itself, and sending `Message::Campaign` to all peers asking for their vote:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L647-L658

In Raft, the term can't regress, and a node can only cast a single vote in each term (even across
restarts), so both of these are persisted to disk via `Log::set_term_vote()`.

When the two other nodes (still in state `Follower`) receive the `Message::Campaign` asking for a
vote, they will first increase their term to 1 (since this is a newer term than their local term 0):

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L347-L351

They then grant the vote since they haven't yet voted for anyone else in term 1. They persist the
vote to disk via `Log::set_term_vote()` and return a `Message::CampaignResponse { vote: true }` to
the candidate:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L424-L449

They also check that the candidate's log is at least as long as theirs, which is trivially true in
this case since the log is empty. This is necessary to ensure that a leader has all committed
entries (see section 5.4.1 in the Raft paper).

When the candidate receives the `Message::CampaignResponse` it records the vote from each node. Once
it has a quorum (in this case 2 out of 3 votes including its own vote) it becomes leader in term 1:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L599-L606

When it becomes leader, it sends a `Message::Heartbeat` to all peers to tell them it is now the
leader in term 1. It also appends an empty entry to its log and replicates it, but we will ignore
this for now (see section 5.4.2 in the Raft paper for why).

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L563-L583

When the other nodes receive the heartbeat, they become followers of the new leader in its term:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L359-L384

From now on, the leader will send periodic `Message::Heartbeat` every 4 ticks (see
`HEARTBEAT_INTERVAL`) to assert its leadership:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L945-L953

The followers record when they last received any message from the leader (including heartbeats), and
will hold a new election if they haven't heard from the leader in an election timeout (e.g. due to a
leader crash or network partition):

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L353-L356

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L489-L497

This entire process is illustrated in the test script [`election`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election),
along with several other test scripts that show e.g. [election ties](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election_tie),
[contested elections](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election_contested),
and other scenarios:

https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/election#L1-L72

## Client Requests and Forwarding

Once a leader has been elected, we can submit read and write requests to it. This is done by
stepping a `Message::ClientRequest` into the node using the local node ID, with a unique request ID
(toyDB uses UUIDv4), and waiting for an outbound response message with the same ID:

https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L134-L151

https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L164-L188

The requests and responses themselves are arbitrary binary data which is interpreted by the state
machine. For our purposes here, let's pretend the requests are:

* `Request::Write("key=value")` → `Response::Write("ok")`
* `Request::Read("key")` → `Response::Read("value")`

The fundamental difference between read and write requests is that write requests are replicated
through Raft and executed on all nodes, while read requests are only executed on the leader without
being appended to the log. It would be possible to execute reads on followers too, for load
balancing, but these reads would be eventually consistent and thus violate linearizability, so toyDB
only executes reads on the leader.

If a request is submitted to a follower, it will be forwarded to the leader and the response
forwarded back to the client (distinguished by the sender/recipient node ID -- a local client always
uses the local node ID):

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L451-L474

For simplicity, we cancel the request with `Error::Abort` if a request is submitted to a candidate,
and similarly if a follower changes its role to candidate or discovers a new leader. We could have
held on to these and redirected them to a new leader, but we keep it simple and ask the client to
retry.

We'll look at the actual read and write request processing next.

## Write Replication and Application

When the leader receives a write request, it proposes the command for replication to followers. It
keeps track of the in-flight write and its log entry index in `writes`, such that it can respond to
the client with the command result once the entry has been committed and applied.

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L895-L904

To propose the command, the leader appends it to its log and sends a `Message::Append` to each
follower to replicate it to their logs:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L966-L980

In steady state, `Message::Append` just contains the single log entry we appended above:

https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L87-L108

However, sometimes followers may be lagging behind the leader (e.g. after a crash), or their log may
have diverged from the leader (e.g. unsuccessful proposals from a stale leader after a network
partition). To handle these cases, the leader tracks the replication progress of each follower as
`raft::Progress`:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L682-L698

We'll gloss over these cases here (see the Raft paper and the code in `raft::Progress` and
`maybe_send_append()` for details). In the steady state, where each entry is successfully appended
and replicated one at a time, `maybe_send_append()` will fall through to the bottom and send a
single entry:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L1068-L1128

The `Message::Append` contains the index/term of the entry immediately before the new entry as
`base_index` and `base_term`. If the follower's log also contains an entry with this index and term
then its log is guaranteed to match (be equal to) the leader's log up to this entry (see section 5.3
in the Raft paper). The follower can then append the new log entry and return a
`Message::AppendResponse` confirming that the entry was appended and that its log matches the
leader's log up to `match_index`:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L386-L410

When the leader receives the `Message::AppendResponse`, it will update its view of the follower's
`match_index`.

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L844-L858

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L701-L710

Once a quorum of nodes (in our case 2 out of 3 including the leader) have the entry in their log,
the leader can commit the entry and apply it to the state machine. It also looks up the in-flight
write request from `writes` and sends the command result back to the client as
`Message::ClientResponse`:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L982-L1032

The leader will also propagate the new commit index to followers via the next heartbeat, so that
they can also apply any pending log entries to their state machine. This isn't strictly necessary,
since reads are executed on the leader and nodes have to apply pending entries before becoming
leaders, but we do it anyway so that they don't fall too far behind on application.

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L359-L384

This process is illustrated in the test scripts [`append`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/append) and [`heartbeat_commits_follower`](https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/heartbeat_commits_follower)
(along with many other scenarios):

https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/append#L1-L43

https://github.com/erikgrinaker/toydb/blob/cb234a0b776484608118fd9382869ee5bc30d4f0/src/raft/testscripts/node/heartbeat_commits_follower#L1-L50

## Read Processing

For linearizable (aka strongly consistent) reads, we must execute read requests on the leader, as
mentioned above. However, this is not sufficient: under e.g. a network partition, a node may think
it's still the leader while in fact a different leader has been elected elsewhere (in a later term)
and executed writes there.

To handle this case, the leader must confirm that it is still the leader for each read, by sending a
`Message::Read` to its followers containing a read sequence number. Only if a quorum confirms that
it is still the leader can the read be executed. This incurs an additional network roundtrip, which
is clearly inefficient, so real-world systems often use leader leases instead (see section 6.4.1 of
the Raft _thesis_, not the paper) -- but it's fine for toyDB.

https://github.com/erikgrinaker/toydb/blob/d96c6dd5ae7c0af55ee609760dcd958c289a44f2/src/raft/message.rs#L125-L132

When the leader receives the read request, it increments the read sequence number, stores the
pending read request in `reads`, and sends a `Message::Read` to all followers:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L906-L917

When the followers receive the `Message::Read`, they simply respond with a `Message::ReadResponse`
if it's from their current leader (messages from stale terms are ignored):

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L342-L346

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L412-L422

When the leader receives the `Message::ReadResponse` it records it in the peer's `Progress`, and
executes the read once a quorum have confirmed the sequence number:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L860-L866

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/node.rs#L1034-L1066

We now have a Raft-managed state machine with replicated writes and linearizable reads.

---

<p align="center">
← <a href="mvcc.md">MVCC Transactions</a> &nbsp; | &nbsp; <a href="sql.md">SQL Engine</a> →
</p>

================================================
FILE: docs/architecture/server.md
================================================
# Server

Now that we've gone over the individual components, we'll tie them all together in the toyDB
server `toydb::Server`, located in the [`server`](https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs) module.

The server wraps an inner Raft node `raft::Node`, which manages the SQL state machine, and is
responsible for routing network traffic between the Raft node, its Raft peers, and SQL clients.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L27-L44

For the network protocol, the server uses the Bincode encoding that we've discussed in the encoding
section, sent over a TCP connection. There's no need for any further framing, since Bincode knows
how many bytes to expect for each message depending on the type it's decoding into.

The server does not use [async Rust](https://rust-lang.github.io/async-book/) and e.g.
[Tokio](https://tokio.rs), instead opting for regular OS threads. Async Rust can significantly
complicate the code, which would obscure the main concepts, and any efficiency gains would be
entirely irrelevant for toyDB.

Internally in the server, messages are passed around between threads using
[Crossbeam channels](https://docs.rs/crossbeam/latest/crossbeam/channel/index.html).

The main server loop `Server::serve()` listens for inbound TCP connections on port 9705 for Raft
peers and 9605 for SQL clients, and spawns threads to process them. We'll look at Raft and SQL
services separately.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L66-L110

## Raft Routing

The heart of the server is the Raft processing thread `Server::raft_route()`. This is responsible
for periodically ticking the Raft node via `raft::Node::tick()`, stepping inbound messages from
Raft peers into the node via `raft::Node::step()`, and sending outbound messages to peers.

It also takes inbound Raft client requests from the `sql::engine::Raft` SQL engine, steps them
into the Raft node via `raft::Node::step()`, and passes responses back to the appropriate client
as the node emits them.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L169-L249

When the node starts up, it spawns a `Server::raft_send_peer()` thread for each Raft peer to send
outbound messages to them.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L84-L91

These threads continually attempt to connect to the peer via TCP, and then read any outbound
`raft::Envelope(raft::Message)` messages from `Server::raft_route()` via a channel and write the
messages into the TCP connection using Bincode:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L146-L167

The server also continually listens for inbound Raft TCP connections from peers in
`Server::raft_accept()`:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L112-L134

When an inbound connection is accepted, a `Server::raft_receive_peer()` thread is spawned that reads
Bincode-encoded `raft::Envelope(raft::Message)` messages from the TCP connection and sends them to
`Server::raft_route()` via a channel.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L136-L144

The Raft cluster is now fully connected, and the nodes can all talk to each other.

## SQL Service

Next, let's serve some SQL clients. The SQL service uses the enums `toydb::Request` and
`toydb::Response` as a client protocol, again Bincode-encoded over TCP.

The primary request type is `Request::Execute` which executes a SQL statement against a
`sql::execution::Session` and returns a `sql::execution::StatementResult`, as we've seen previously.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L312-L337

The server sets up a `sql::engine::Raft` SQL engine, with a Crossbeam channel that's used to send
`raft::Request` Raft client requests to `Server::raft_route()` and onwards to the local
`raft::Node`.  It then spawns a `Server::sql_accept()` thread to listen for inbound SQL client
connections:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L104-L106

When a SQL client connection is accepted, a new client session `sql::execution::Session` is set up
for the client, and we spawn a `Server::sql_session()` thread to serve the connection:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L251-L272

These session threads continually read `Request` messages from the client, execute them against the
SQL session (and ultimately the Raft node), before sending a `Response` back to the client.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/server.rs#L274-L309

## `toydb` Binary

The `toydb` binary in `src/bin/toydb.rs` launches the server, and is a thin wrapper around
`toydb::Server`. It is a tiny [`clap`](https://docs.rs/clap/latest/clap/) command:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L82-L89

It first parses a server configuration from the `toydb.yaml` file:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L30-L59

Then it initializes the Raft log storage and SQL state machine:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L105-L133

And finally it launches the `toydb::Server`:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/bin/toydb.rs#L135-L137

toyDB is now up and running!

---

<p align="center">
← <a href="sql-execution.md">SQL Execution</a> &nbsp; | &nbsp; <a href="client.md">Client</a> →
</p>

================================================
FILE: docs/architecture/sql-data.md
================================================
# SQL Data Model

The SQL data model represents user data in tables and rows. It is made up of data types and schemas,
in the [`sql::types`](https://github.com/erikgrinaker/toydb/tree/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types)
module.

## Data Types

toyDB supports four basic scalar data types as `sql::types::DataType`: booleans, integers, floats,
and strings.

https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L15-L27

Specific values are represented as `sql::types::Value`, using the corresponding Rust types. toyDB
also supports SQL `NULL` values, i.e. unknown values, following the rules of
[three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic).

https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L40-L64

The `Value` type provides basic formatting, conversion, and mathematical operations.

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types/value.rs#L68-L79

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/types/value.rs#L164-L370

It also specifies comparison and ordering semantics, but these are subtly different from the SQL
semantics. For example, in Rust code `Value::Null == Value::Null` yields `true`, while in SQL
`NULL = NULL` yields `NULL`.  This mismatch is necessary for the Rust code to properly detect and
process `Null` values, and the desired SQL semantics are implemented during expression evaluation
which we'll cover below.

https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L91-L162

During execution, a row of values is represented as `sql::types::Row`, with multiple rows emitted
via `sql::types::Rows` row iterators:

https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L378-L388

## Schemas

toyDB schemas only support tables. There are no named indexes or constraints, and there's only a
single unnamed database.

Tables are represented by `sql::types::Table`:

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L12-L25

A table is made up of a set of columns, represented by `sql::types::Column`. These support the data
types described above, along with unique constraints, foreign keys, and secondary indexes.

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L29-L53

The table name serves as a unique identifier, and can't be changed later. In fact, table schemas
are entirely static: they can only be created or dropped (there are no schema changes).

Table schemas are stored in the catalog, represented by the `sql::engine::Catalog` trait. We'll
revisit the implementation of this trait in the SQL storage section.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L60-L79

Table schemas are validated when created via `Table::validate()`, which enforces invariants and
internal consistency. It uses the catalog to look up information about other tables, e.g. that
foreign key references point to a valid target column in a different table.

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L98-L170

Table rows are validated via `Table::validate_row()`, which ensures that a `sql::types::Row`
conforms to the schema (e.g. that value types match the column data types). It uses a
`sql::engine::Transaction` to look up other rows in the database, e.g. to check for primary key
conflicts (we'll get back to this later).

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/types/schema.rs#L172-L236

## Expressions

During SQL execution, we also have to model _expressions_, such as `1 + 2 * 3`. These are
represented as values and operations on them, and can be nested as a tree to represent compound
operations.

https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L11-L64


For example, the expression `1 + 2 * 3` (taking [precedence](https://en.wikipedia.org/wiki/Order_of_operations)
into account) is represented as:

```rust
//    +
//   / \
//  1   *
//     /  \
//    2    3
Expression::Add(
    Expression::Constant(Value::Integer(1)),
    Expression::Multiply(
        Expression::Constant(Value::Integer(2)),
        Expression::Constant(Value::Integer(3)),
    ),
)
```

An `Expression` can contain two kinds of values: constant values as
`Expression::Constant(sql::types::Value)`, and dynamic values as `Expression::Column(usize)` column
references. The latter will fetch a `sql::types::Value` from a `sql::types::Row` at the specified
index during evaluation.

We'll see later how the SQL parser and planner transform text expressions like `1 + 2 * 3` into an
`Expression`, and how they resolve column names to row indexes, e.g. `price * 0.25` to
`row[3] * 0.25`.

Expressions are evaluated recursively via `Expression::evaluate()`, given a `sql::types::Row` with
input values for column references, and return a final `sql::types::Value` result:

https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L73-L208

Many of the comparison operations like `==` are implemented explicitly here instead of using
`sql::types::Value` comparisons. This is where we implement the SQL semantics of special values like
`NULL`, such that `NULL = NULL` yields `NULL` instead of `TRUE`.

For mathematical operations however, we generally dispatch to these methods on `sql::types::Value`:

https://github.com/erikgrinaker/toydb/blob/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql/types/value.rs#L185-L295

Expression parsing and evaluation is tested via test scripts in
[`src/sql/testscripts/expressions`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts/expressions).

---

<p align="center">
← <a href="sql.md">SQL Engine</a> &nbsp; | &nbsp; <a href="sql-storage.md">SQL Storage</a> →
</p>

================================================
FILE: docs/architecture/sql-execution.md
================================================
# SQL Execution

Now that the planner and optimizer have done all the hard work of figuring out how to execute a
query, it's time to actually execute it.

## Plan Executor

Plan execution is done by `sql::execution::Executor` in the
[`sql::execution`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/execution)
module, using a `sql::engine::Transaction` to access the SQL storage engine.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/execution/executor.rs#L14-L49

The executor takes a `sql::planner::Plan` as input, and will return an `ExecutionResult` depending
on the statement type.

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L331-L339

When executing the plan, the executor will branch off depending on the statement type:

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L57-L101

We'll focus on `SELECT` queries here, which are the most interesting.

toyDB uses the iterator model (also known as the volcano model) for query execution. In the case of
a `SELECT` query, the result is a row iterator, and pulling from this iterator by calling `next()`
will drive the entire execution pipeline by recursively calling `next()` on the child nodes' row
iterators. This maps very naturally onto Rust's iterators, and we leverage these to construct the
execution pipeline as nested iterators.

Execution itself is fairly straightforward, since we're just doing exactly what the planner tells us
to do in the plan. We call `Executor::execute_node` recursively on each `sql::planner::Node`,
starting with the root node. Each node returns a result row iterator that the parent node can pull
its input rows from, process them, and output the resulting rows via its own row iterator (with the
root node's iterator being returned to the caller):

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L103-L104

`Executor::execute_node()` will simply look at the type of `Node`, recursively call
`Executor::execute_node()` on any child nodes, and then process the rows accordingly.

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L103-L212

We won't discuss every plan node in detail, but let's consider the movie plan we've looked at
previously:

```
Select
└─ Order: movies.released desc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ HashJoin: inner on movies.genre_id = genres.id
         ├─ Scan: movies (released >= 2000)
         └─ Scan: genres
```

We'll recursively call `execute_node()` until we end up in the two `Scan` nodes. These simply
call through to the SQL engine (either using Raft or local disk) via `Transaction::scan()`, passing
in the scan predicate if any, and return the resulting row iterator:

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L203-L204

`HashJoin` will then join the output rows from the `movies` and `genres` iterators by using a
hash join. This builds an in-memory table for `genres` and then iterates over `movies`, joining
the rows:

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L128-L141

https://github.com/erikgrinaker/toydb/blob/889aef9f24c0fa4d58e314877fa17559a9f3d5d2/src/sql/execution/join.rs#L103-L183

The `Projection` node will simply evaluate the (trivial) column expressions using each joined
row as input:

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L179-L186

And finally the `Order` node will sort the results (which requires buffering them all in memory):

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L173-L177

https://github.com/erikgrinaker/toydb/blob/686d3971a253bfc9facc2ba1b0e716cff5c109fb/src/sql/execution/executor.rs#L298-L328

The output row iterator of `Order` is returned via `ExecutionResult::Select`, and the caller can now
go ahead and pull the resulting rows from it.

## Session Management

The entry point to the SQL engine is the `sql::execution::Session`, which represents a single user
session. It is obtained via `sql::engine::Engine::session()`.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L14-L21

The session takes a series of raw SQL statement strings as input and parses them:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L29-L33

For each statement, it returns a result depending on the kind of statement:

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L132-L148

The session itself performs transaction control. It handles `BEGIN`, `COMMIT`, and `ROLLBACK`
statements, and modifies the transaction accordingly.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L34-L70

Any other statements are processed by the SQL planner, optimizer, and executor as we've seen in
previous sections.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L77-L83

These statements are always executed using the session's current transaction. If there is no active
transaction, the session will create a new, implicit transaction for each statement.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/execution/session.rs#L87-L112

And with that, we have a fully functional SQL engine!

---

<p align="center">
← <a href="sql-optimizer.md">SQL Optimization</a> &nbsp; | &nbsp; <a href="server.md">Server</a> →
</p>

================================================
FILE: docs/architecture/sql-optimizer.md
================================================
# SQL Optimization

[Query optimization](https://en.wikipedia.org/wiki/Query_optimization) attempts to improve query
performance and efficiency by altering the execution plan. This is a deep and complex field, and
we can only scratch the surface here.

toyDB's query optimizer is very basic -- it only has a handful of rudimentary heuristic
optimizations to illustrate how the process works. Real-world optimizers use much more sophisticated
methods, including statistical analysis, cost estimation, adaptive execution, etc.

The optimizers are located in the [`sql::planner::optimizer`](https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs) module.
An optimizer `sql::planner::Optimizer` just takes in a plan node `sql::planner::Node` (the root node
in the plan), and returns an optimized node:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L20-L25

Optimizations are always implemented as recursive node transformations. To help with this, `Node`
has the helper methods `Node::transform` and `Node::transform_expressions` which recurse into a node
or expression tree and call a given transformation closure on each node, as either
[pre-order](https://en.wikipedia.org/wiki/Tree_traversal#Pre-order,_NLR) or
[post-order](https://en.wikipedia.org/wiki/Tree_traversal#Post-order,_LRN) transforms:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L269-L371

A technique that's often useful during optimization is to convert expressions into
[conjunctive normal form](https://en.wikipedia.org/wiki/Conjunctive_normal_form), i.e. "an AND of
ORs". For example, the two following expressions are equivalent, but the latter is in conjunctive
normal form (it's a chain of ANDs):

```
(a AND b) OR (c AND d)  →  (a OR c) AND (a OR d) AND (b OR c) AND (b OR d)
```

This is useful because we can often move each AND operand independently around in the plan tree
and still get the same result -- we'll see this in action later. Expressions are converted into
conjunctive normal form via `Expression::into_cnf`, which is implemented using
[De Morgan's laws](https://en.wikipedia.org/wiki/De_Morgan%27s_laws):

https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L289-L351

We'll have a brief look at all of toyDB's optimizers, which are listed here in the order they're
applied:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L9-L18

Test scripts for the optimizers are in [`src/sql/testscripts/optimizers`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts/optimizers),
and show how query plans evolve as each optimizer is applied.

## Constant Folding

The `ConstantFolding` optimizer performs [constant folding](https://en.wikipedia.org/wiki/Constant_folding).
This pre-evaluates constant expressions in the plan during planning, instead of evaluating them
for every row during execution.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L27-L30

For example, consider the query `SELECT 1 + 2 * 3 - foo FROM bar`. There is no point in
re-evaluating `1 + 2 * 3` for every row in `bar`, because the result is always the same, so we can
just evaluate this once during planning, transforming the expression into `7 - foo`.

Concretely, this plan:

```
Select
└─ Projection: 1 + 2 * 3 - bar.foo
   └─ Scan: bar
```

Should be transformed into this plan:

```
Select
└─ Projection: 7 - bar.foo
   └─ Scan: bar
```

To do this, `ConstantFolding` simply checks whether an `Expression` tree contains an
`Expression::Column` node -- if it doesn't, then it must be a constant expression (since that's the
only dynamic value in an expression), and we can evaluate it with a `None` input row and replace the
original expression node with an `Expression::Constant` node.

This is done recursively for each plan node, and recursively for each expression node (so it does
this both for `SELECT`, `WHERE`, `ORDER BY`, and all other parts of the query). Notably, it does a
post-order expression transform, so it starts at the expression leaf nodes and attempts to transform
each expression node as it moves back up the tree -- this allows it to iteratively evaluate constant
parts as far as possible for each branch.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L32-L56

Additionally, `ConstantFolding` also short-circuits logical expressions. For example, the expression
`foo AND FALSE` will always be `FALSE`, regardless of what `foo` is, so we can replace it with
`FALSE`:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L58-L84

As the code comment mentions though, this doesn't fold optimally: it doesn't attempt to rearrange
expressions, which would require knowledge of precedence rules. For example, `(1 + foo) - 2` could
be folded into `foo - 1` by first rearranging it as `foo + (1 - 2)`, but we don't do this currently.

## Filter Pushdown

The `FilterPushdown` optimizer attempts to push filter predicates as far down into the plan as
possible, to reduce the number of rows each node has to process.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L90-L95

Recall the `movies` query plan from the planning section:

```
Select
└─ Order: movies.released desc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ Filter: movies.released >= 2000
         └─ NestedLoopJoin: inner on movies.genre_id = genres.id
            ├─ Scan: movies
            └─ Scan: genres
```

Even though we're filtering on `released >= 2000`, the `Scan` node still has to read all movies from
disk and send them via Raft, and the `NestedLoopJoin` node still has to join all of them. It would
be nice if we could push this filtering into the `NestedLoopJoin` and `Scan` nodes and avoid this
extra work, and this is exactly what `FilterPushdown` does.

The only plan nodes that have predicates that can be pushed down are `Filter` nodes and
`NestedLoopJoin` nodes, so we recurse through the plan tree and look for these nodes, attempting
to push down.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L97-L110

When it encounters the `Filter` node, it will extract the predicate and attempt to push it down
into its `source` node:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L139-L153

If the source node is a `Filter`, `NestedLoopJoin`, or `Scan` node, then we can push the predicate
down into it by `AND`ing it with the existing predicate (if any).

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L112-L137

In our case, we were able to push the `Filter` into the `NestedLoopJoin`, and our plan now looks
like this:

```
Select
└─ Order: movies.released desc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ NestedLoopJoin: inner on movies.genre_id = genres.id AND movies.released >= 2000
         ├─ Scan: movies
         └─ Scan: genres
```

But we're still not done, as we'd like to push `movies.released >= 2000` down into the `Scan` node.
Pushdown for join nodes is a little more tricky, because we can only push down parts of the
expression that reference one of the source nodes.

We first have to convert the expression into conjunctive normal form, i.e. an AND of ORs, as we've
discussed previously. This allows us to examine and push down each AND part in isolation, because it
has the same effect regardless of whether it is evaluated in the `NestedLoopJoin` node or one of
the source nodes. Our expression is already in conjunctive normal form, though.

We then look at each AND part, and check which side of the join it has column references for.  If it
only references one of the sides, then the expression can be pushed down into it. We also make some
effort here to move primary/foreign key constants across to both sides, but we'll gloss over that.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L155-L247

This allows us to push down the `movies.released >= 2000` predicate into the corresponding `Scan`
node, significantly reducing the amount of data transferred across Raft:

```
Select
└─ Order: movies.released desc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ NestedLoopJoin: inner on movies.genre_id = genres.id
         ├─ Scan: movies (released >= 2000)
         └─ Scan: genres
```

## Index Lookups

The `IndexLookup` optimizer uses primary key or secondary index lookups instead of full table
scans where possible.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L250-L252

The optimizer itself is fairly straightforward. It assumes that `FilterPushdown` has already pushed
predicates down into `Scan` nodes, so it only needs to examine these. It converts the predicate into
conjunctive normal form, and looks for any parts that are direct column lookups -- i.e.
`column = value` (possibly a long OR chain of these).

If it finds any, and the column is either a primary key or secondary index column, then we convert
the `Scan` node into either a `KeyLookup` or `IndexLookup` node respectively. If there are any
further AND predicates remaining, we add a parent `Filter` node to keep these predicates.

For example, the following plan:

```
Select
└─ Scan: movies ((id = 1 OR id = 7 OR id = 3) AND released >= 2000)
```

Will be transformed into one that does individual key lookups rather than a full table scan:

```
Select
└─ Filter: movies.released >= 2000
   └─ KeyLookup: movies (1, 3, 7)
```

The code is as outlined above:

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L254-L303

Helped by `Expression::is_column_lookup()` and `Expression::into_column_values()`:

https://github.com/erikgrinaker/toydb/blob/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/types/expression.rs#L363-L421

## Hash Join

The `HashJoin` optimizer will replace a `NestedLoopJoin` with a `HashJoin` where possible.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L305-L307

A [nested loop join](https://en.wikipedia.org/wiki/Nested_loop_join) is a very inefficient O(n²)
algorithm, which iterates over all rows in the right source for each row in the left source to see
if they match. However, it is completely general, and can join on arbitrarily complex predicates.

In the common case where the join predicate is an equality comparison such as
`movies.genre_id = genres.id` (i.e. an [equijoin](https://en.wikipedia.org/wiki/Relational_algebra#θ-join_and_equijoin)),
then we can instead use a [hash join](https://en.wikipedia.org/wiki/Hash_join). This scans the right
table once, builds an in-memory hash table from it, and for each left row it looks up any right rows
in the hash table. This is a much more efficient O(n) algorithm.

In our previous movie example, we are in fact doing an equijoin:

```
Select
└─ Order: movies.released desc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ NestedLoopJoin: inner on movies.genre_id = genres.id
         ├─ Scan: movies (released >= 2000)
         └─ Scan: genres
```

And so our `NestedLoopJoin` can be replaced by a `HashJoin`:

```
Select
└─ Order: movies.released desc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ HashJoin: inner on movies.genre_id = genres.id
         ├─ Scan: movies (released >= 2000)
         └─ Scan: genres
```

The `HashJoin` optimizer is extremely simple: if the join predicate is an equijoin, use a hash join.
This isn't always a good idea (the right source can be huge and we can run out of memory for the
hash table), but we keep it simple.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L309-L348

Of course there are many other join algorithms out there, and one of the harder problems in SQL
optimization is how to efficiently perform large N-way multijoins. We don't attempt to tackle these
problems here -- the `HashJoin` optimizer is just a very simple example of such join optimization.

## Short Circuiting

The `ShortCircuit` optimizer tries to find nodes that can't possibly do any useful work, and either
removes them from the plan, or replaces them with trivial nodes that don't do anything. It is kind
of similar to the `ConstantFolding` optimizer in spirit, but works on plan nodes rather than
expression nodes.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L350-L354

For example, `Filter` nodes with a `TRUE` predicate won't actually filter anything:

```
Select
└─ Filter: true
   └─ Scan: movies
```

So we can just remove them:

```
Select
└─ Scan: movies
```

Similarly, `Filter` nodes with a `FALSE` predicate will never emit anything:

```
Select
└─ Filter: false
   └─ Scan: movies
```

There's no point doing a scan in this case, so we can just replace it with a `Nothing` node that
does no work and doesn't emit anything:

```
Select
└─ Nothing
```

The optimizer tries to find a bunch of such patterns. This can also tidy up query plans a fair bit
by removing unnecessary cruft.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/optimizer.rs#L356-L438

---

<p align="center">
← <a href="sql-planner.md">SQL Planning</a> &nbsp; | &nbsp; <a href="sql-execution.md">SQL Execution</a> →
</p>

================================================
FILE: docs/architecture/sql-parser.md
================================================
# SQL Parsing

We finally arrive at SQL. The SQL parser is the first stage in processing SQL queries and
statements, located in the [`sql::parser`](https://github.com/erikgrinaker/toydb/tree/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser)
module.

The SQL parser's job is to take a raw SQL string and turn it into a structured form that's more
convenient to work with. In doing so, it will validate that the string is in fact valid SQL
_syntax_. However, it doesn't know if the SQL statement actually makes sense -- it has no idea which
tables or columns exist, what their data types are, and so on. That's the job of the planner, which
we'll look at later.

For example, let's say the parser is given the following SQL query:

```sql
SELECT name, price, price * 25 / 100 AS vat
FROM products JOIN categories ON products.category_id = categories.id
WHERE categories.code = 'BLURAY' AND stock > 0
ORDER BY price DESC
LIMIT 10
```

It will generate a structure that looks something like this (in simplified syntax):

```rust
// A SELECT statement.
Statement::Select {
    // SELECT name, price, price * 25 / 100 AS vat
    select: [
        (Column("name"), None),
        (Column("price"), None),
        (
            Divide(
                Multiply(Column("price"), Integer(25)),
                Integer(100)
            ),
            Some("vat"),
        ),
    ]

    // FROM products JOIN categories ON products.category_id = categories.id
    from: [
        Join {
            left: Table("products"),
            right: Table("categories"),
            type: Inner,
            predicate: Some(
                Equal(
                    Column("products.category_id"),
                    Column("categories.id"),
                )
            )
        }
    ]

    // WHERE categories.code = 'BLURAY' AND stock > 0
    where: Some(
        And(
            Equal(
                Column("categories.code"),
                String("BLURAY"),
            ),
            GreaterThan(
                Column("stock"),
                Integer(0),
            )
        )
    )

    // ORDER BY price DESC
    order: [
        (Column("price"), Descending),
    ]

    // LIMIT 10
    limit: Some(Integer(10))
}
```

Let's have a look at how this happens.

## Lexer

We begin with the `sql::parser::Lexer`, which takes the raw SQL string and performs
[lexical analysis](https://en.wikipedia.org/wiki/Lexical_analysis) to convert it into a sequence of
tokens. These tokens are things like number, string, identifier, SQL keyword, and so on.

This preprocessing is useful to deal with some of the "noise" of SQL text, such as whitespace,
string quotes, identifier normalization, and so on. It also specifies which symbols and keywords are
valid in our SQL queries. This makes the parser's life a lot easier.

The lexer doesn't care about SQL structure at all, only that the individual pieces (tokens) of a
string are well-formed. For example, the following input string:

```
'foo' ) 3.14 SELECT + x
```

Will result in these tokens:

```
String("foo") CloseParen Number("3.14") Keyword(Select) Plus Ident("x")
```

Tokens and keywords are represented by the `sql::parser::Token` and `sql::parser::Keyword` enums
respectively:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L8-L47

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L86-L155

The lexer takes an input string and emits tokens as an iterator:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L311-L337

It does this by repeatedly attempting to scan the next token until it reaches the end of the string
(or errors). It can determine the kind of token by looking at the first character:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L358-L373

And then scan across the following characters as appropriate to generate a valid token. For example,
this is how a quoted string (e.g. `'foo'`) is lexed into a `Token::String` (including handling of
any escaped quotes inside the string):

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/lexer.rs#L435-L451

These tokens become the input to the parser.

## Abstract Syntax Tree

The end result of the parsing process will be an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree)
(AST), which is a structured representation of a SQL statement, located in the
[`sql::parser::ast`](https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs) module.

The root of this tree is the `sql::parser::ast::Statement` enum, which represents all the different
kinds of SQL statements that we support, along with their contents:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L6-L145

The nested tree structure is particularly apparent with expressions, which represent values and
operations on them. Take for example the expression `2 * 3 - 4 / 2`, which evaluates to the value
`4`.

We've seen in the data model section how such expressions are represented as
`sql::types::Expression`, but before we get there we have to parse them. The parser has its own
representation `sql::parser::ast::Expression` -- this is necessary e.g. because in the AST, we
represent columns as names rather than numeric indexes (we don't know yet which columns exist or
what their names are, we'll get to that during planning).

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L147-L170

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/ast.rs#L204-L234

For example, `2 * 3 - 4 / 2` is represented as:

```rust
Expression::Operator(Operator::Subtract(
    // The left-hand operand of -
    Expression::Operator(Operator::Multiply(
        // The left-hand operand of *
        Expression::Literal(Literal::Integer(2)),
        // The right-hand operand of *
        Expression::Literal(Literal::Integer(3)),
    )),
    // The right-hand operand of -
    Expression::Operator(Operator::Divide(
        // The left-hand operand of /
        Expression::Literal(Literal::Integer(4)),
        // The right-hand operand of /
        Expression::Literal(Literal::Integer(2)),
    )),
))
```

## Parser

The parser, `sql::parser::Parser`, takes lexer tokens as input and builds an `ast::Statement`
from them:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L9-L32

We can determine the kind of statement we're parsing simply by looking at the first keyword:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L109-L130

Let's see how a `SELECT` statement is parsed. The different clauses in a `SELECT` (e.g. `FROM`,
`WHERE`, etc.) must always be given in a specific order, and they always begin with the appropriate
keyword, so we can simply try to parse each clause in the expected order:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L330-L342

Parsing each clause is also just a matter of parsing the expected parts in order. For example, the
initial `SELECT` clause is just a comma-separated list of expressions with an optional alias:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L344-L365

The `FROM` clause is a comma-separated list of table names, optionally joined with other tables:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L367-L427

And the `WHERE` clause is just a predicate expression to filter by:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L429-L435

Expression parsing is where this gets tricky, because we have to respect the rules of operator
precedence and associativity. For example, according to mathematical order of operations (aka
"PEMDAS") the expression `2 * 3 - 4 / 2` must be parsed as `(2 * 3) - (4 / 2)` which yields 4, not
`2 * (3 - 4) / 2` which yields -1.

toyDB does this using the [precedence climbing algorithm](https://en.wikipedia.org/wiki/Operator-precedence_parser#Precedence_climbing_method),
which is a fairly simple and compact algorithm as far as these things go. In a nutshell, it will
greedily and recursively group operators together as long as their precedence is the same or higher
than that of the operators preceding them (hence "precedence climbing"). For example:

```
-----   ----- Precedence 2: * and /
------------- Precedence 1: -
2 * 3 - 4 / 2
```

The algorithm is documented in more detail on `Parser::parse_expression()`:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/parser/parser.rs#L501-L696

---

<p align="center">
← <a href="sql-raft.md">SQL Raft Replication</a> &nbsp; | &nbsp; <a href="sql-planner.md">SQL Planning</a> →
</p>

================================================
FILE: docs/architecture/sql-planner.md
================================================
# SQL Planning

The SQL planner in the [`sql::planner`](https://github.com/erikgrinaker/toydb/tree/c64012e29c5712d6fe028d3d5375a98b8faea266/src/sql/planner)
module takes a SQL statement AST from the parser and generates an execution plan for it. We won't
actually execute it just yet though, only figure out how to execute it.

## Execution Plan

A plan is represented by the `sql::planner::Plan` enum. The variant specifies the operation to
execute (e.g. `SELECT`, `INSERT`, `UPDATE`, `DELETE`):

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L15-L73

Below the root, the plan is typically made up of a tree of nested `sql::planner::Node`. Each node
emits a stream of SQL rows as output, and may take streams of input rows from child nodes.

https://github.com/erikgrinaker/toydb/blob/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/planner/plan.rs#L106-L175

Here is an example, taken from the `Plan` code comment above:

```sql
SELECT title, released, genres.name AS genre
FROM movies INNER JOIN genres ON movies.genre_id = genres.id
WHERE released >= 2000
ORDER BY released
```

Which results in this query plan:

```
Select
└─ Order: movies.released asc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ Filter: movies.released >= 2000
         └─ NestedLoopJoin: inner on movies.genre_id = genres.id
            ├─ Scan: movies
            └─ Scan: genres
```

Rows flow from the tree leaves to the root:

1. `Scan` nodes read rows from the tables `movies` and `genres`.
2. `NestedLoopJoin` joins the rows from `movies` and `genres`.
3. `Filter` discards rows with release dates older than 2000.
4. `Projection` picks out the requested column values from the rows.
5. `Order` sorts the rows by release date.
6. `Select` returns the final rows to the client.

## Scope and Name Resolution

One of the main jobs of the planner is to resolve column names to column indexes in the input rows
of each node.

In the query example above, the `WHERE released >= 2000` filter may refer to a column `released`
from either the joined `movies` table or the `genres` table. The planner needs to figure out which
table has a `released` column, and also figure out which column number in the `NestedLoopJoin`
output rows corresponds to the `released` column (for example column number 2).

This job is further complicated by the fact that many nodes can alias, reorder, or drop columns,
and some nodes may also refer to columns that shouldn't be part of the result at all (for example,
it's possible to `ORDER BY` a column that won't be output by a `SELECT` projection at all, but
the `Order` node still needs access to the column data to sort by it).

The planner uses a `sql::planner::Scope` to keep track of which column names are currently visible,
and which column indexes they refer to. For each node the planner builds, starting from the leaves,
it creates a new `Scope` that contains the currently visible columns, tracking how they are modified
and rearranged by each node.

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L577-L610

When an AST expression refers to a column name, the planner can use `Scope::lookup_column()` to find
out which column number the expression should take its input value from.

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L660-L686

## Planner

The planner itself is `sql::planner::Planner`. It uses a `sql::engine::Catalog` to look up
information about tables and columns from storage.

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L12-L20

To build an execution plan, the planner first looks at the `ast::Statement` kind to determine
what kind of plan to build:

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L28-L47

Let's build this `SELECT` plan from above:

```sql
SELECT title, released, genres.name AS genre
FROM movies INNER JOIN genres ON movies.genre_id = genres.id
WHERE released >= 2000
ORDER BY released
```

Which should result in this plan:

```
Select
└─ Order: movies.released asc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ Filter: movies.released >= 2000
         └─ NestedLoopJoin: inner on movies.genre_id = genres.id
            ├─ Scan: movies
            └─ Scan: genres
```

The planner is given the following (simplified) AST from the parser as input:

```rust
// A SELECT statement.
Statement::Select {
    // SELECT title, released, genres.name AS genre
    select: [
        (Column("title"), None),
        (Column("released"), None),
        (Column("genres.name"), Some("genre")),
    ]

    // FROM movies INNER JOIN genres ON movies.genre_id = genres.id
    from: [
        Join {
            left: Table("movies"),
            right: Table("genres"),
            type: Inner,
            predicate: Some(
                Equal(
                    Column("movies.genre_id"),
                    Column("genres.id"),
                )
            )
        }
    ]

    // WHERE released >= 2000
    where: Some(
        GreaterThanOrEqual(
            Column("released"),
            Integer(2000),
        )
    )

    // ORDER BY released
    order: [
        (Column("released"), Ascending),
    ]
}
```

The first thing `Planner::build_select` does is to create an empty scope (which will track column
names and indexes) and build the `FROM` clause which will generate the initial input rows:

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L170-L179

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L283-L289

`Planner::build_from()` first encounters the `ast::From::Join` item, which joins `movies` and
`genres`. This will build a `Node::NestedLoopJoin` plan node for the join, which is the simplest and
most straightforward join algorithm -- it simply iterates over all rows in the `genres` table for
every row in the `movies` table and emits the joined rows (we'll see how to optimize it with a
better join algorithm later).

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L319-L344

It first recurses into `Planner::build_from()` to build each of the `ast::From::Table` nodes for
each table. This will look up the table schemas in the catalog, add them to the current scope, and
build a `Node::Scan` node which will emit all rows from each table. The `Node::Scan` nodes are
placed into the `Node::NestedLoopJoin` above.

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L312-L317

While building the `Node::NestedLoopJoin`, it also needs to convert the join expression
`movies.genre_id = genres.id` into a proper `sql::types::Expression`. This is done by
`Planner::build_expression()`:

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L493-L568

Expression building is mostly a direct translation from an `ast::Expression` variant to a
corresponding `sql::types::Expression` variant (for example from
`ast::Expression::Operator(ast::Operator::Equal)` to `sql::types::Expression::Equal`). However, as
mentioned earlier, `ast::Expression` contains column references by name, while
`sql::types::Expression` contains column references as row indexes. This name resolution is done
here, by looking up the column names in the scope:

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L521-L523

The expression we're building is the join predicate of `Node::NestedLoopJoin`, so it operates on
joined rows containing all columns of `movies` then all columns of `genres`. It also operates on all
combinations of joined rows (the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product)),
and the purpose of the join predicate is to determine which joined rows to actually keep. For
example, the full set of joined rows that are evaluated might be:

| movies.id | movies.title | movies.released | movies.genre_id | genres.id | genres.name |
|-----------|--------------|-----------------|-----------------|-----------|-------------|
| 1         | Sicario      | 2015            | 2               | 1         | Drama       |
| 1         | Sicario      | 2015            | 2               | 2         | Action      |
| 2         | 21 Grams     | 2003            | 1               | 1         | Drama       |
| 2         | 21 Grams     | 2003            | 1               | 2         | Action      |
| 3         | Heat         | 1995            | 2               | 1         | Drama       |
| 3         | Heat         | 1995            | 2               | 2         | Action      |

The join predicate should pick out the rows where `movies.genre_id = genres.id`. The scope will
reflect the column layout in the example above, and can resolve the column names to zero-based row
indexes as `#3 = #4`, which will be the final built `Expression`.

Now that we've built the `FROM` clause into a `Node::NestedLoopJoin` of two `Node::Scan` nodes, we
move on to the `WHERE` clause. This simply builds the `WHERE` expression `released >= 2000`, like
we've already seen with the join predicate, and creates a `Node::Filter` node which takes its input
rows from the `Node::NestedLoopJoin` and filters them by the given expression. Again, the scope
keeps track of which input columns we're getting from the join node and resolves the `released`
column reference in the expression.

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L202-L206

We then build the `SELECT` clause, which emits the `title, released, genres.name AS genre` columns.
This is just a list of expressions that are built in the current scope and placed into a
`Node::Projection` (the expressions could be arbitrarily complex). However, we also have to make
sure to update the scope with the final three columns that are output to subsequent nodes, taking
into account the `genre` alias for the original `genres.name` column (we won't dwell on the "hidden
columns" mentioned there -- they're not relevant for our query).

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L214-L234

Finally, we build the `ORDER BY` clause. Again, this just builds a trivial expression for `released`
and places it into a `Node::Order` node which takes input rows from the `Node::Projection` and
sorts them by the order expression.

https://github.com/erikgrinaker/toydb/blob/6f6cec4db10bc015a37ee47ff6c7dae383147dd5/src/sql/planner/planner.rs#L245-L252

And that's it. The `Node::Order` is placed into the root `Plan::Select`, and we have our final plan.

```
Select
└─ Order: movies.released asc
   └─ Projection: movies.title, movies.released, genres.name as genre
      └─ Filter: movies.released >= 2000
         └─ NestedLoopJoin: inner on movies.genre_id = genres.id
            ├─ Scan: movies
            └─ Scan: genres
```

We'll see how to execute it soon, but first we should optimize it to see if we can make it run
faster -- in particular, to see if we can avoid reading all movies from storage, and if we can do
better than the very slow nested loop join.

---

<p align="center">
← <a href="sql-parser.md">SQL Parsing</a> &nbsp; | &nbsp; <a href="sql-optimizer.md">SQL Optimization</a> →
</p>

================================================
FILE: docs/architecture/sql-raft.md
================================================
# SQL Raft Replication

toyDB uses Raft to replicate SQL storage across a cluster of nodes (see the Raft section for
details). All nodes will store a full copy of the SQL database, and the Raft leader will replicate
writes across nodes and execute reads.

Recall the Raft state machine interface `raft::State`:

https://github.com/erikgrinaker/toydb/blob/8782c2b05f11333c1586ef248f1a13dc1c8dec4a/src/raft/state.rs#L4-L51

In toyDB, the state machine is just a `sql::engine::Local` storage engine with a thin wrapper:

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L278-L291

Raft will submit read and write commands to this state machine as binary `Vec<u8>` data, so we have
to represent the methods of `sql::engine::Engine` as binary Raft commands. We do this as two
enums, `sql::engine::raft::Read` and `sql::engine::raft::Write`, which we'll Bincode-encode:

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L16-L71

Notice that almost all requests include a `mvcc::TransactionState`. Most of the useful methods of
`sql::engine::Engine` are on the `sql::engine::Transaction`, but unlike the `Local` engine, below
Raft we can't hold on to a `Transaction` object in memory between each command -- nodes may restart
and leadership may move, and we want client transactions to keep working despite this. Instead, we
will use the client-supplied `mvcc::TransactionState` to reconstruct a `Transaction` for every
command via `mvcc::Transaction::resume()` and call methods on it.

When the state machine receives a write command, it decodes it as a `Write` and calls the
appropriate `Local` method. The result is Bincode-encoded and returned to the caller, who knows what
return type to expect for a given command. The state machine also keeps track of the Raft applied
index of each command as a separate key in the key/value store.

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L346-L367

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L306-L338

Similarly, read commands are decoded as a `Read` and the appropriate `Local` method is called:

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L369-L404

That's the state machine running below Raft. But how do we actually send these commands to Raft and
receive results? That's handled by the `sql::engine::Raft` implementation, which uses a channel to
send requests to the local Raft node (we'll see how this plumbing works in the server section):

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L80-L95

The channel takes a `raft::Request` containing binary Raft client requests and a return channel
where the Raft node can send back a `raft::Response`. The Raft engine has a few convenience methods
to send requests and receive responses, for both read and write requests:

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L114-L135

And the implementation of the `sql::engine::Engine` and `sql::engine::Transaction` traits simply
send these requests via Raft:

https://github.com/erikgrinaker/toydb/blob/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/engine/raft.rs#L194-L276

One thing to note here is that we don't support streaming data via Raft, so e.g. the
`Transaction::scan` method will buffer the entire result in a `Vec`. With a full table scan, this
will load the entire table into memory -- that's unfortunate, but we keep it simple.

To summarize, this is what happens when `Transaction::insert()` is called to insert a row via Raft:

1. `sql::engine::raft::Transaction::insert()`: called to insert a row.
2. `sql::engine::raft::Write::Insert`: enum representation of the insert command.
3. `raft::Request::Write`: raft request containing the Bincode-encoded `Write::Insert` command.
4. `sql::engine::raft::Engine::tx`: sends the `Request::Write` and response channel to Raft.
5. `raft::Node::step()`: the `Request::Write` is given to Raft in a `Message::ClientRequest`.
6. Raft does its replication thing, and commits the command's log entry.
7. `raft::State::apply()`: the Bincode-encoded `Write::Insert` is passed to the state machine.
8. `sql::engine::raft::State::apply()`: decodes the command to a `Write::Insert`.
9. `sql::engine::raft::State::local`: contains the `Local` engine on each node.
10. `sql::engine::local::Engine::resume()`: called to obtain the SQL/MVCC transaction.
11. `sql::engine::local::Transaction::insert()`: the row is inserted to the local engine.
12. `raft::RawNode::tx`: the `Ok(())` result is sent as a Bincode-encoded `Message::ClientResponse`.
13. `sql::engine::raft::Transaction::insert()`: receives the result and returns it to the caller.

The plumbing here will be covered in more detail in the server section.

---

<p align="center">
← <a href="sql-storage.md">SQL Storage</a> &nbsp; | &nbsp; <a href="sql-parser.md">SQL Parsing</a> →
</p>

================================================
FILE: docs/architecture/sql-storage.md
================================================
# SQL Storage

The SQL storage engine, in the [`sql::engine`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/sql/engine)
module, stores tables and rows. toyDB has two SQL storage implementations:

* `sql::engine::Local`: local storage using a `storage::Engine` key/value store.
* `sql::engine::Raft`: Raft-replicated storage, using `Local` on each node below Raft.

These implement the `sql::engine::Engine` trait, which specifies the SQL storage API. SQL execution
can use either simple local storage or Raft-replicated storage -- toyDB itself always uses the
Raft-replicated engine, but many tests use a local in-memory engine.

The `sql::engine::Engine` trait is fully transactional, based on the `storage::MVCC` transaction
engine discussed previously. As such, the trait just has a few methods that begin transactions --
the storage logic itself is implemented in the transaction, which we'll cover next. The trait
also has a `session()` method to start SQL sessions for query execution, which we'll revisit in the
execution section.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L9-L29

Here, we'll only look at the `Local` engine, and we'll discuss Raft replication afterwards. `Local`
itself is just a thin wrapper around a `storage::MVCC<storage::Engine>` to create transactions:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L50-L97

## Key/Value Representation

`Local` uses a `storage::Engine` key/value store to store SQL table schemas, table rows, and
secondary index entries. But how do we represent these as keys and values?

The keys are represented by the `sql::engine::Key` enum, and encoded using the Keycode encoding
that we've discussed in the encoding section:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L15-L31

The values are encoded using the Bincode encoding, where the value type is given by the key:

* `Key::Table` → `sql::types::Table` (table schemas)
* `Key::Index` → `BTreeSet<sql::types::Value>` (indexed primary keys)
* `Key::Row` → `sql::types::Row` (table rows)

Recall that the Keycode encoding will store keys in sorted order. This means that all `Key::Table`
entries come first, then all `Key::Index`, then all `Key::Row`. These are further grouped and
sorted by their fields.

For example, consider these SQL tables containing movies and genres, with a secondary index on
`movies.genre_id` for fast lookups of movies with a given genre:

```sql
CREATE TABLE genres (
    id INTEGER PRIMARY KEY,
    name STRING NOT NULL
);

CREATE TABLE movies (
    id INTEGER PRIMARY KEY,
    title STRING NOT NULL,
    released INTEGER NOT NULL,
    genre_id INTEGER NOT NULL INDEX REFERENCES genres
);

INSERT INTO genres VALUES (1, 'Drama'), (2, 'Action');

INSERT INTO movies VALUES
    (1, 'Sicario', 2015, 2),
    (2, '21 Grams', 2003, 1),
    (3, 'Heat', 1995, 2);
```

This would result in the following illustrated keys and values, in the given order:

```
/Table/genres → Table { name: "genres", primary_key: 0, columns: ... }
/Table/movies → Table { name: "movies", primary_key: 0, columns: ... }
/Index/movies/genre_id/Integer(1) → BTreeSet { Integer(2) }
/Index/movies/genre_id/Integer(2) → BTreeSet { Integer(1), Integer(3) }
/Row/genres/Integer(1) → Row { Integer(1), String("Drama") }
/Row/genres/Integer(2) → Row { Integer(2), String("Action") }
/Row/movies/Integer(1) → Row { Integer(1), String("Sicario"), Integer(2015), Integer(2) }
/Row/movies/Integer(2) → Row { Integer(2), String("21 Grams"), Integer(2003), Integer(1) }
/Row/movies/Integer(3) → Row { Integer(3), String("Heat"), Integer(1995), Integer(2) }
```

Thus, if we want to do a full table scan of the `movies` table, we just do a prefix scan of
`/Row/movies/`. If we want to do a secondary index lookup of all movies with `genre_id = 2`, we
fetch `/Index/movies/genre_id/Integer(2)` and find that movies with `id = {1,3}` have this genre.

To help with prefix scans, the valid key prefixes are represented as `sql::engine::KeyPrefix`:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L35-L48

For a look at the actual on-disk binary storage format, see the test scripts under
[`src/sql/testscripts/writes`](https://github.com/erikgrinaker/toydb/tree/c2b0f7f1d6cbf6e2cdc09fc0aec7b050e840ec21/src/sql/testscripts/writes),
which output the logical and raw binary representation of write operations.

## Schema Catalog

The `sql::engine::Catalog` trait is used to store table schemas, i.e. `sql::types::Table`. It has a
handful of methods for creating, dropping and fetching tables (recall that toyDB does not support
schema changes). The `Table::name` field is used as a unique table identifier throughout.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L60-L79

The `Catalog` trait is also fully transactional, as it must be implemented on a transaction via the
`type Transaction: Transaction + Catalog` trait bound on `sql::engine::Engine`.

Creating a table is straightforward: insert a key/value pair with a Keycode-encoded `Key::Table`
for the key and a Bincode-encoded `sql::types::Table` for the value. We first check that the
table doesn't already exist, and validate the table schema using `Table::validate()`.

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L340-L347

Similarly, fetching and listing tables is straightforward: just key/value gets or scans using the
appropriate keys.

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L390-L399

Dropping tables is a bit more involved, since we have to perform some validation and also delete the
actual table rows and any secondary index entries, but it's not terribly complicated:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L349-L388

## Row Storage and Transactions

The workhorse of the SQL storage engine is the `Transaction` trait, which provides
[CRUD](https://en.wikipedia.org/wiki/Create,_read,_update_and_delete) operations (create, read,
update, delete) on table rows and secondary index entries. For performance (especially with Raft),
it operates on row batches rather than individual rows.

https://github.com/erikgrinaker/toydb/blob/0839215770e31f1e693d5cccf20a68210deaaa3f/src/sql/engine/engine.rs#L31-L58

The `Local::Transaction` implementation is just a wrapper around an MVCC transaction, and the
commit/rollback methods just call straight through to it:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L99-L102

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L182-L192

To insert new rows into a table, we first have to perform some validation: check that the table
exists and validate the rows against the table schema (including checking for e.g. primary key
conflicts and foreign key references). We then store the rows as key/value pairs, using a
`Key::Row` with the table name and primary key value. And finally, we update secondary index entries
(if any).

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L252-L268

Row updates are similar to inserts, but in the case of a primary key change we instead delete the
old row and insert a new one, for simplicity. Secondary index updates also have to update both the
old and new entries.

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L296-L337

Row deletions are also similar: validate that the deletion is safe (e.g. check that there are no
foreign key references to it), then delete the `Key::Row` keys and any secondary index entries:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L194-L246

To fetch rows by primary key, we simply call through to key/value gets using the appropriate
`Key::Row`:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L248-L250

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L127-L133

Similarly, index lookups fetch a `Key::Index` for the indexed value, returning matching primary
keys:

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L270-L273

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L115-L125

Scanning table rows just performs a prefix scan with the appropriate `KeyPrefix::Row`, returning a
row iterator. This can optionally also do row filtering via filter pushdowns, which we'll revisit
when we look at the SQL optimizer.

https://github.com/erikgrinaker/toydb/blob/39c6b60afc4c235f19113dc98087176748fa091d/src/sql/engine/local.rs#L275-L294

And with that, we can now store and retrieve SQL tables and rows on disk. Let's see how to replicate
it across nodes via Raft.

---

<p align="center">
← <a href="sql-data.md">SQL Data Model</a> &nbsp; | &nbsp; <a href="sql-raft.md">SQL Raft Replication</a> →
</p>

================================================
FILE: docs/architecture/sql.md
================================================
# SQL Engine

The SQL engine provides support for the SQL query language, and is the main database interface. It
uses a key/value store for data storage, MVCC for transactions, and Raft for replication. The SQL
engine itself consists of several distinct components that form a pipeline:

> Client → Session → Lexer → Parser → Planner → Optimizer → Executor → Storage

The SQL engine is located in the [`sql`](https://github.com/erikgrinaker/toydb/tree/b2fe7b76ee634ca6ad31616becabfddb1c03d34b/src/sql)
module. We'll discuss each of the components in a bottom-up manner.

The SQL engine is tested as a whole by test scripts under
[`src/sql/testscripts`](https://github.com/erikgrinaker/toydb/tree/9419bcf6aededf0e20b4e7485e2a5fa3e975d79f/src/sql/testscripts).
These typically take raw SQL strings as input, execute them against an in-memory storage engine,
and output the result along with intermediate state such as the query plan, storage operations,
and binary key/value data.

---

<p align="center">
← <a href="raft.md">Raft Consensus</a> &nbsp; | &nbsp; <a href="sql-data.md">SQL Data Model</a> →
</p>

================================================
FILE: docs/architecture/storage.md
================================================
# Storage Engine

toyDB uses an embedded [key/value store](https://en.wikipedia.org/wiki/Key–value_database) for data
storage, located in the [`storage`](https://github.com/erikgrinaker/toydb/tree/213e5c02b09f1a3cac6a8bbd0a81773462f367f5/src/storage)
module. This stores arbitrary keys and values as binary byte strings. The storage engine doesn't
know or care what the keys and values contain -- we'll see later how the SQL data model, with tables
and rows, is mapped onto this key/value structure.

The storage engine supports simple set/get/delete operations on individual keys. It does not itself
support transactions -- this is built on top, and we'll get back to it shortly.

Keys are stored in sorted order. This allows range scans, where we can iterate over all key/value
pairs between two specific keys, or with a specific key prefix. This will be needed by other
components in the system, e.g. to scan all rows in a specific SQL table, to scan all versions of an
MVCC key, to scan the tail of the Raft log, etc.

The storage engine is pluggable: there are multiple implementations, and the user can choose which
one to use in the config file. These implement the `storage::Engine` trait:

https://github.com/erikgrinaker/toydb/blob/4804df254034c51f367d1380d389d80695cd7054/src/storage/engine.rs#L8-L58

Let's look at the existing storage engine implementations.

## `Memory` Storage Engine

The simplest storage engine is the `storage::Memory` engine. This is a trivial implementation which
stores data in memory using the Rust standard library's
[`BTreeMap`](https://doc.rust-lang.org/std/collections/struct.BTreeMap.html), without persisting
it to disk. It is primarily used for testing.

Since this is just a wrapper around the `BTreeMap` we can include it in its entirety here:

https://github.com/erikgrinaker/toydb/blob/8f8eae0dcf70b1a0df2e853b1f6600e0c7075340/src/storage/memory.rs#L8-L77

## `BitCask` Storage Engine

The main storage engine is `storage::BitCask`. This is a very simple variant of
[BitCask](https://riak.com/assets/bitcask-intro.pdf), used in the [Riak](https://riak.com/)
database. It is kind of like the [LSM-tree](https://en.wikipedia.org/wiki/Log-structured_merge-tree)'s
baby cousin.

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L15-L55

toyDB's BitCask implementation uses a single append-only log file for storage. To write a key/value
pair, we simply append it to the file. To delete a key, we append a special tombstone value. When
reading a key, the last entry for that key in the file is used.

The file format for a key/value pair is simply:

1. The key length, as a big-endian `u32` (4 bytes).
2. The value length, as a big-endian `i32` (4 bytes). -1 if tombstone.
3. The binary key (key length bytes).
4. The binary value (value length bytes, omitted for tombstones).

For example, the key/value pair `foo=bar` would be written as follows (in hexadecimal):

```
keylen   valuelen key    value
00000003 00000003 666f6f 626172
```

Because the data file is a simple log, we don't need a separate [write-ahead log](https://en.wikipedia.org/wiki/Write-ahead_logging)
for crash recovery -- the data file _is_ the write-ahead log.

To quickly look up key/value pairs when reading, we maintain an in-memory `KeyDir` index which maps
a key to the latest value's position in the file. All keys must therefore fit in memory.

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L57-L65

We initially generate this index by scanning through the entire file when it is opened:

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L267-L332

To write a key, we append it to the file and update the `KeyDir`:

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L155-L159

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L342-L366

To delete a key, we append a tombstone value instead:

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L122-L126

To read a value for a key, we look up the key's file location in the `KeyDir` index (if the key
exists), and then read it from the file:

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L334-L340

The `KeyDir` uses an inner stdlib `BTreeMap` to keep track of keys. This allows range scans, where
we iterate over a sorted set of keys between the range bounds, loading each key from the file:

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L144-L146

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L207-L225

As keys are updated and deleted, we'll keep accumulating old versions in the log file. To remove
these, the log file is compacted on startup. This writes out the latest value of every live
key/value pair to a new file, and replaces the old file. The keys are written in sorted order, to
make later scans faster.

https://github.com/erikgrinaker/toydb/blob/3e467512dca55843f0b071b3e239f14724f59a41/src/storage/bitcask.rs#L172-L195

---

<p align="center">
← <a href="overview.md">Overview</a> &nbsp; | &nbsp; <a href="encoding.md">Key/Value Encoding</a> →
</p>

================================================
FILE: docs/architecture.md
================================================
Moved to [`architecture/index.md`](architecture/index.md).

================================================
FILE: docs/crate/Cargo.toml
================================================
[package]
name = "toydb"
version = "1.0.1"
description = "A simple distributed SQL database, built for education"
authors = ["Erik Grinaker <erik@grinaker.org>"]
license = "Apache-2.0"
homepage = "https://github.com/erikgrinaker/toydb"
repository = "https://github.com/erikgrinaker/toydb"
edition = "2024"


================================================
FILE: docs/crate/README.md
================================================
# toyDB

toyDB is a distributed SQL database in Rust, built from scratch as an educational project. Main
features:

* Raft distributed consensus for linearizable state machine replication.

* ACID transactions with MVCC-based snapshot isolation.

* Pluggable storage engine with BitCask and in-memory backends.

* Iterator-based query engine with heuristic optimization and time-travel support.

* SQL interface including joins, aggregates, and transactions.

toyDB is not distributed as a crate, see <https://github.com/erikgrinaker/toydb> for more.

This crate used to contain the [joydb](https://crates.io/crates/joydb) database. Thanks to Serhii
Potapov for donating the crate name.

================================================
FILE: docs/crate/src/lib.rs
================================================
//! This crate is just a simple README.md placeholder. toydb is not intended to be used as a
//! library, and is not distributed as a crate. See <https://github.com/erikgrinaker/toydb>.


================================================
FILE: docs/examples.md
================================================
# SQL Examples

The following examples demonstrate some of toyDB's SQL features. For more details, see the
[SQL reference](sql.md).

- [Setup](#setup)
- [Creating Tables and Data](#creating-tables-and-data)
- [Constraints and Referential Integrity](#constraints-and-referential-integrity)
- [Basic SQL Queries](#basic-sql-queries)
- [Expressions](#expressions)
- [Joins](#joins)
- [Explain](#explain)
- [Aggregates](#aggregates)
- [Transactions](#transactions)
- [Time-Travel Queries](#time-travel-queries)

## Setup

To start a five-node cluster on the local machine (requires a working
[Rust compiler](https://www.rust-lang.org/tools/install)), run:

```
$ ./cluster/run.sh
toydb2 19:06:28 [ INFO] Listening on 0.0.0.0:9602 (SQL) and 0.0.0.0:9702 (Raft)
toydb2 19:06:28 [ERROR] Failed connecting to Raft peer 127.0.0.1:9705: Connection refused
toydb5 19:06:28 [ INFO] Listening on 0.0.0.0:9605 (SQL) and 0.0.0.0:9705 (Raft)
[...]
toydb5 19:06:29 [ INFO] Voting for toydb-d in term 1 election
toydb3 19:06:29 [ INFO] Voting for toydb-d in term 1 election
toydb4 19:06:29 [ INFO] Won election for term 1, becoming leader
```

In a separate terminal, start a `toysql` client and check the server status:

```
$ cargo run --release --bin toysql
Connected to toyDB node "toydb-a". Enter !help for instructions.
toydb> !status

Server:    5 (leader 4 in term 1 with 5 nodes)
Raft log:  1 committed, 0 applied, 0.000 MB (hybrid storage)
Node logs: 1:1 2:1 3:1 4:1 5:1
SQL txns:  0 active, 0 total (bitcask storage)
```

The cluster is shut down by pressing Ctrl-C. Data is saved under `cluster/toydb?/data/`;
delete the contents to start over.

## Creating Tables and Data

As a basis for later examples, we'll create a small movie database. The following SQL statements
can be pasted into `toysql`:

```sql
CREATE TABLE genres (
    id INTEGER PRIMARY KEY,
    name STRING NOT NULL
);
INSERT INTO genres VALUES
    (1, 'Science Fiction'),
    (2, 'Action'),
    (3, 'Drama'),
    (4, 'Comedy');

CREATE TABLE studios (
    id INTEGER PRIMARY KEY,
    name STRING NOT NULL
);
INSERT INTO studios VALUES
    (1, 'Mosfilm'),
    (2, 'Lionsgate'),
    (3, 'StudioCanal'),
    (4, 'Warner Bros'),
    (5, 'Focus Features');

CREATE TABLE movies (
    id INTEGER PRIMARY KEY,
    title STRING NOT NULL,
    studio_id INTEGER NOT NULL INDEX REFERENCES studios,
    genre_id INTEGER NOT NULL INDEX REFERENCES genres,
    released INTEGER NOT NULL,
    rating FLOAT
);
INSERT INTO movies VALUES
    (1,  'Stalker',             1, 1, 1979, 8.2),
    (2,  'Sicario',             2, 2, 2015, 7.6),
    (3,  'Primer',              3, 1, 2004, 6.9),
    (4,  'Heat',                4, 2, 1995, 8.2),
    (5,  'The Fountain',        4, 1, 2006, 7.2),
    (6,  'Solaris',             1, 1, 1972, 8.1),
    (7,  'Gravity',             4, 1, 2013, 7.7),
    (8,  '21 Grams',            5, 3, 2003, 7.7),
    (9,  'Birdman',             4, 4, 2014, 7.7),
    (10, 'Inception',           4, 1, 2010, 8.8),
    (11, 'Lost in Translation', 5, 4, 2003, 7.7),
    (12, 'Eternal Sunshine of the Spotless Mind', 5, 3, 2004, 8.3);
```

toyDB supports some basic datatypes, as well as primary keys, foreign keys, and column indexes.
For more information on these, see the [SQL reference](sql.md). Schema changes such as
`ALTER TABLE` are not supported, only `CREATE TABLE` and `DROP TABLE`.

The tables can be inspected via the `!tables` and `!table` commands:

```sql
toydb> !tables
genres
movies
studios

toydb> !table genres
CREATE TABLE genres (
  id INTEGER PRIMARY KEY,
  name STRING NOT NULL
)
```

## Constraints and Referential Integrity

Schemas enforce referential integrity and other constraints:

```sql
toydb> DROP TABLE studios;
Error: Table studios is referenced by table movies column studio_id

toydb> DELETE FROM studios WHERE id = 1;
Error: Primary key 1 is referenced by table movies column studio_id

toydb> UPDATE movies SET id = 1;
Error: Primary key 1 already exists for table movies

toydb> INSERT INTO movies VALUES (13, 'Nebraska', 6, 3, 2013, 7.7);
Error: Referenced primary key 6 in table studios does not exist

toydb> INSERT INTO movies VALUES (13, 'Nebraska', NULL, 3, 2013, 7.7);
Error: NULL value not allowed for column studio_id

toydb> INSERT INTO movies VALUES (13, 'Nebraska', 'Unknown', 3, 2013, 7.7);
Error: Invalid datatype STRING for INTEGER column studio_id
```

## Basic SQL Queries

Most basic SQL query functionality is supported:

```sql
toydb> SELECT * FROM studios;
1|Mosfilm
2|Lionsgate
3|StudioCanal
4|Warner Bros
5|Focus Features

toydb> SELECT title, rating FROM movies WHERE released >= 2000 ORDER BY rating DESC LIMIT 3;
Inception|8.8
Eternal Sunshine of the Spotless Mind|8.3
Gravity|7.7
```

Column headers can be enabled with `!headers on`:

```sql
toydb> !headers on
Headers enabled

toydb> SELECT id, name AS genre FROM genres;
id|genre
1|Science Fiction
2|Action
3|Drama
4|Comedy
```

## Expressions

All common mathematical operators are implemented:

```sql
toydb> SELECT 1 + 2 * 3;
7

toydb> SELECT (1 + 2) * 4 / -3;
-4

toydb> SELECT 3! + 7 % 4 - 2 ^ 3;
1
```

64-bit floating point arithmetic is also supported, including infinity and NaN:

```sql
toydb> SELECT 3.14 * 2.718;
8.53452

toydb> SELECT 1.0 / 0.0;
inf

toydb> SELECT 1e10 ^ 8;
100000000000000000000000000000000000000000000000000000000000000000000000000000000

toydb> SELECT 1e10 ^ 8 / INFINITY, 1e10 ^ 1e10, INFINITY / INFINITY;
0|inf|NaN
```

And of course three-valued logic:

```sql
toydb> SELECT TRUE AND TRUE, TRUE AND FALSE, TRUE AND NULL, FALSE AND NULL;
TRUE|FALSE|NULL|FALSE

toydb> SELECT TRUE OR FALSE, FALSE OR FALSE, TRUE OR NULL, FALSE OR NULL;
TRUE|FALSE|TRUE|NULL

toydb> SELECT NOT TRUE, NOT FALSE, NOT NULL;
FALSE|TRUE|NULL
```

Which would be useless without comparison operators for all types:

```sql
toydb> SELECT 3 > 1, 3 <= 1, 3 = 3.0;
TRUE|FALSE|TRUE

toydb> SELECT 'a' = 'A', 'foo' > 'bar', '👍' != '👎';
FALSE|TRUE|TRUE

toydb> SELECT INFINITY > -INFINITY, NULL = NULL;
TRUE|NULL
```

## Joins

No SQL database would be complete without joins, and toyDB supports most join types such as
inner joins (both implicit and explicit):

```sql
toydb> SELECT m.id, m.title, g.name FROM movies m JOIN genres g ON m.genre_id = g.id LIMIT 4;
1|Stalker|Science Fiction
2|Sicario|Action
3|Primer|Science Fiction
4|Heat|Action

toydb> SELECT m.id, m.title, g.name FROM movies m, genres g WHERE m.genre_id = g.id LIMIT 4;
1|Stalker|Science Fiction
2|Sicario|Action
3|Primer|Science Fiction
4|Heat|Action
```

Left and right outer joins:

```sql
toydb> SELECT s.id, s.name, g.name FROM studios s LEFT JOIN genres g ON s.id = g.id;
1|Mosfilm|Science Fiction
2|Lionsgate|Action
3|StudioCanal|Drama
4|Warner Bros|Comedy
5|Focus Features|NULL

toydb> SELECT g.id, g.name, s.name FROM genres g RIGHT JOIN studios s ON g.id = s.id;
1|Science Fiction|Mosfilm
2|Action|Lionsgate
3|Drama|StudioCanal
4|Comedy|Warner Bros
NULL|NULL|Focus Features
```

And cross joins (both implicit and explicit):

```sql
toydb> SELECT g.name, s.name FROM genres g, studios s WHERE s.name < 'S';
Science Fiction|Mosfilm
Science Fiction|Lionsgate
Science Fiction|Focus Features
Action|Mosfilm
Action|Lionsgate
Action|Focus Features
Drama|Mosfilm
Drama|Lionsgate
Drama|Focus Features
Comedy|Mosfilm
Comedy|Lionsgate
Comedy|Focus Features
```

We can join on arbitrary predicates, such as joining movies with any genres whose name is
ordered after the movie's title:

```sql
toydb>  SELECT   m.title, g.name
        FROM     movies m JOIN genres g ON g.name > m.title
        ORDER BY m.title, g.name;

21 Grams|Action
21 Grams|Comedy
21 Grams|Drama
21 Grams|Science Fiction
Birdman|Comedy
Birdman|Drama
Birdman|Science Fiction
Eternal Sunshine of the Spotless Mind|Science Fiction
Gravity|Science Fiction
Heat|Science Fiction
Inception|Science Fiction
Lost in Translation|Science Fiction
Primer|Science Fiction
```

And we can join multiple tables, even using the same table multiple times - like in this example
where we find all science fiction movies released since 2000 by studios that have released any 
movie rated 8 or higher:

```sql
toydb> SELECT   m.id, m.title, g.name AS genre, m.released, s.name AS studio
       FROM     movies m JOIN genres g ON m.genre_id = g.id,
                studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8
       WHERE    m.studio_id = s.id AND m.released >= 2000 AND g.id = 1
       ORDER BY m.title ASC;

7|Gravity|Science Fiction|2013|Warner Bros
10|Inception|Science Fiction|2010|Warner Bros
5|The Fountain|Science Fiction|2006|Warner Bros
```

## Explain

When optimizing complex queries with several joins, it can often be useful to inspect the query
plan via an `EXPLAIN` query:

```sql
toydb> EXPLAIN
       SELECT   m.id, m.title, g.name AS genre, m.released, s.name AS studio
       FROM     movies m JOIN genres g ON m.genre_id = g.id,
                studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8
       WHERE    m.studio_id = s.id AND m.released >= 2000 AND g.id = 1
       ORDER BY m.title ASC;

Order: m.title asc
└─ Projection: m.id, m.title, g.name, m.released, s.name
   └─ HashJoin: inner on m.studio_id = s.id
      ├─ HashJoin: inner on m.genre_id = g.id
      │  ├─ Filter: m.released > 2000 OR m.released = 2000
      │  │  └─ IndexLookup: movies as m column genre_id (1)
      │  └─ KeyLookup: genres as g (1)
      └─ HashJoin: inner on s.id = good.studio_id
         ├─ Scan: studios as s
         └─ Scan: movies as good (good.rating > 8 OR good.rating = 8)
```

Here, we can see that the planner does a primary key lookup on `genres` and an index lookup on
`movies.genre_id`, filtering the resulting movies by release year and joining them. It also
does full table scans of `studios` and `movies` (to find the good movies) and joins them, pushing
the `rating >= 8` filter down to the `movies` table scan. The results of these two joins are also
joined to produce the final result, which is then formatted and sorted.

## Aggregates

Most basic aggregate functions are supported:

```sql
toydb> SELECT COUNT(*), MIN(rating), MAX(rating), AVG(rating), SUM(rating) FROM movies;
12|6.9|8.8|7.841666666666668|94.10000000000001
```

We can group by values and filter the aggregate results:

```sql
toydb> SELECT s.id, s.name, AVG(m.rating) AS average
       FROM movies m JOIN studios s ON m.studio_id = s.id
       GROUP BY s.id, s.name
       HAVING average > 7.8
       ORDER BY average DESC, s.name ASC;
1|Mosfilm|8.149999999999999
4|Warner Bros|7.919999999999999
5|Focus Features|7.900000000000001
```

And we can combine aggregate functions with arbitrary expressions, both inside and outside:

```sql
toydb> SELECT s.id, s.name, ((MAX(rating^2) - MIN(rating^2)) / AVG(rating^2)) ^ (0.5) AS spread
       FROM movies m JOIN studios s ON m.studio_id = s.id
       GROUP BY s.id, s.name
       HAVING MAX(rating) - MIN(rating) > 0.5
       ORDER BY spread DESC;
4|Warner Bros|0.6373540990222496
5|Focus Features|0.39194971607693424
```

## Transactions

toyDB supports ACID transactions via MVCC-based snapshot isolation. This provides atomic
transactions with good isolation, without taking out locks or blocking reads on writes. As a basic
example, the below transaction is rolled back without taking effect, as opposed to `COMMIT`
which would make it permanent:

```sql
toydb> BEGIN;
Began transaction 131

toydb:131> INSERT INTO genres VALUES (5, 'Western');
toydb:131> SELECT * FROM genres;
1|Science Fiction
2|Action
3|Drama
4|Comedy
5|Western
toydb:131> ROLLBACK;
Rolled back transaction 131

toydb> SELECT * FROM genres;
1|Science Fiction
2|Action
3|Drama
4|Comedy
```

We'll demonstrate transactions by covering most common transaction anomalies given two
concurrent sessions, and show how toyDB prevents these anomalies in all cases but one. In these
examples, the left half is user A and the right is user B. Time flows downwards such that
commands on the same line happen at the same time.

**Dirty write:** an uncommitted write by A should not be affected by a concurrent B write.

```sql
a> BEGIN;
a> INSERT INTO genres VALUES (5, 'Western');
                                                   b> INSERT INTO genres VALUES (5, 'Romance');
                                                   Error: Serialization failure, retry transaction
a> SELECT * FROM genres WHERE id = 5;
5|Western
```

The serialization failure here occurs because the first write always wins. This may not be an
optimal strategy, but it is correct in terms of preventing serialization anomalies.

**Dirty read:** an uncommitted write by A should not be visible to B until committed.

```sql
a> BEGIN;
a> INSERT INTO genres VALUES (5, 'Western');
                                                  b> SELECT * FROM genres WHERE id = 5;
                                                  No rows returned
a> COMMIT;
                                                  b> SELECT * FROM genres WHERE id = 5;
                                                  5|Western
```

**Lost update:** when A and B both read a value, before updating it in turn, the first write should
not be overwritten by the second.

```sql
a> BEGIN;                                         b> BEGIN;
a> SELECT title, rating FROM movies WHERE id = 2; b> SELECT title, rating FROM movies WHERE id = 2;
Sicario|7.6                                       Sicario|7.6
a> UPDATE movies SET rating = 7.8 WHERE id = 2;
                                                  b> UPDATE movies SET rating = 7.7 WHERE id = 2;
                                                  Error: Serialization failure, retry transaction
a> COMMIT;
```

**Fuzzy read:** B should not see a value suddenly change in its transaction, even if A commits a 
new value.

```sql
a> BEGIN;                                         b> BEGIN;
                                                  b> SELECT * FROM genres WHERE id = 1;
                                                  1|Science Fiction
a> UPDATE genres SET name = 'Scifi' WHERE id = 1;
a> COMMIT;
                                                  b> SELECT * FROM genres WHERE id = 1;
                                                  1|Science Fiction
                                                  b> COMMIT;

                                                  b> SELECT * FROM genres WHERE id = 1;
                                                  1|Scifi
```

**Read skew:** if A reads two values, and B modifies the second value in between the reads, A 
should see the old second value.

```sql
a> BEGIN;
a> SELECT * FROM genres WHERE id = 2;
2|Action
                                                  b> BEGIN;
                                                  b> UPDATE genres SET name = 'Drama' WHERE id = 2;
                                                  b> UPDATE genres SET name = 'Action' WHERE id = 3;
                                                  b> COMMIT;
a> SELECT * FROM genres WHERE id = 3;
3|Drama
```

**Phantom read:** when A runs a query with a predicate, and B commits a matching write, A should
not see the write when rerunning it.

```sql
a> BEGIN;
a> SELECT * FROM genres WHERE id > 2;
3|Drama
4|Comedy
                                                  b> INSERT INTO genres VALUES (5, 'Western');
a> SELECT * FROM genres WHERE id > 2;
3|Drama
4|Comedy
```

**Write skew:** when A reads row X and writes it to row Y, B should not concurrently be able to
read row Y and write it to row X.

```sql
a> BEGIN;                                         b> BEGIN;
a> SELECT * FROM genres WHERE id = 2;
2|Action
                                                  b> SELECT * FROM genres WHERE id = 3;
                                                  3|Drama
                                                  b> UPDATE genres SET name = 'Drama' WHERE id = 2;
a> UPDATE genres SET name = 'Action' WHERE id = 3;
a> COMMIT;                                        b> COMMIT;
```

Here, the writes actually go through. This anomaly is not protected against by snapshot isolation, 
and thus not by toyDB either - doing so would require implementing serializable snapshot isolation. 
However, this is the only common serialization anomaly not handled by toyDB, and is not among the
most severe.

## Time-Travel Queries

Since toyDB uses MVCC for transactions and keeps all historical versions, the state of the database
can be queried at any arbitrary point in the past. toyDB uses incremental transaction IDs as
logical timestamps:

```sql
toydb> SELECT * FROM genres;
1|Science Fiction
2|Drama
3|Action
4|Comedy

toydb> BEGIN;
Began transaction 173
toydb:173> UPDATE genres SET name = 'Scifi' WHERE id = 1;
toydb:173> INSERT INTO genres VALUES (5, 'Western');
toydb:173> COMMIT;
Committed transaction 173

toydb> SELECT * FROM genres;
1|Scifi
2|Drama
3|Action
4|Comedy
5|Western

toydb> BEGIN READ ONLY AS OF SYSTEM TIME 172;
Began read-only transaction 175 in snapshot at version 172
toydb@172> SELECT * FROM genres;
1|Science Fiction
2|Drama
3|Action
4|Comedy
```

================================================
FILE: docs/references.md
================================================
# References

This is the main research material I used while building toyDB. It is a subset of my
[reading list](https://github.com/erikgrinaker/readings).

## Introduction

Andy Pavlo's CMU lectures are an absolutely fantastic introduction to database internals:

- 🎥 [CMU 15-445 Intro to Database Systems](https://www.youtube.com/playlist?list=PLSE8ODhjZXjbohkNBWQs_otTrBTrjyohi) (A Pavlo 2019)
- 🎥 [CMU 15-721 Advanced Database Systems](https://www.youtube.com/playlist?list=PLSE8ODhjZXjasmrEd2_Yi1deeE360zv5O) (A Pavlo 2020)

Martin Kleppmann has written an excellent overview of database technologies and concepts, while Alex
Petrov goes in depth on implementation of storage engines and distributed systems algorithms:

- 📖 [Designing Data-Intensive Applications](https://dataintensive.net/) (M Kleppmann 2017)
- 📖 [Database Internals](https://www.databass.dev) (A Petrov 2019)

## Raft

The Raft consensus algorithm is described in a very readable paper by Diego Ongaro, and in a talk
given by his advisor John Ousterhout:

- 📄 [In Search of an Understandable Consensus Algorithm](https://raft.github.io/raft.pdf) (D Ongaro, J Ousterhout 2014)
- 🎥 [Designing for Understandability: The Raft Consensus Algorithm](https://www.youtube.com/watch?v=vYp4LYbnnW8) (J Ousterhout 2016)

However, Raft has several subtle pitfalls, and Jon Gjengset's student guide was very helpful in
drawing attention to these:

- 🔗 [Students' Guide to Raft](https://thesquareplanet.com/blog/students-guide-to-raft/) (J Gjengset 2016)

## Parsing

Thorsten Ball has written a very enjoyable hands-on introduction to parsers where he implements
first an interpreter and then a compiler for the made-up Monkey programming language (in Go):

- 📖 [Writing An Interpreter In Go](https://interpreterbook.com) (T Ball 2016) 
- 📖 [Writing A Compiler In Go](https://compilerbook.com) (T Ball 2018)

The toyDB expression parser is inspired by a blog post by Eli Bendersky describing the precedence
climbing algorithm, which is the algorithm I found the most elegant:

- 💬 [Parsing Expressions by Precedence Climbing](https://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing) (E Bendersky 2012)

## Transactions

Jepsen (i.e. Kyle Kingsbury) has an excellent overview of consistency and isolation models, which 
is very helpful in making sense of the jungle of overlapping and ill-defined terms:

- 🔗 [Consistency Models](https://jepsen.io/consistency) (Jepsen 2016)

For more background on this, in particular on how snapshot isolation provided by the MVCC
transaction engine used in toyDB does not fit into the traditional SQL isolation levels, the
following classic papers were useful:

- 📄 [A Critique of ANSI SQL Isolation Levels](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-95-51.pdf) (H Berenson et al 1995)
- 📄 [Generalized Isolation Level Definitions](http://pmg.csail.mit.edu/papers/icde00.pdf) (A Adya, B Liskov, P O'Neil 2000)

As for actually implementing MVCC, I found blog posts to be the most helpful:

- 💬 [Implementing Your Own Transactions with MVCC](https://levelup.gitconnected.com/implementing-your-own-transactions-with-mvcc-bba11cab8e70) (E Chance 2015)
- 💬 [How Postgres Makes Transactions Atomic](https://brandur.org/postgres-atomicity) (B Leach 2017)


================================================
FILE: docs/sql.md
================================================
# SQL Reference

## Data Types

The following data types are supported:

* `BOOLEAN` (`BOOL`): logical truth values, i.e. true and false.
* `FLOAT` (`DOUBLE`): 64-bit signed floating point numbers, using [IEEE 754 `binary64`](https://en.wikipedia.org/wiki/binary64) encoding. Supports magnitudes of 10⁻³⁰⁷ to 10³⁰⁸ with 53-bit precision (~15 significant figures), as well as the special values infinity and NaN.
* `INTEGER` (`INT`): 64-bit signed integer numbers with a range of ±2⁶³-1.
* `STRING` (`TEXT`, `VARCHAR`): UTF-8 encoded strings.

In addition, the special `NULL` value is used for an unknown value, following the rules of [three-valued logic](https://en.wikipedia.org/wiki/Three-valued_logic).

Numeric types are not interchangeable; a float value (even without a fractional part) cannot be stored in an integer column and vice-versa.

## SQL Syntax

### Keywords

Keywords are reserved words with special meaning in SQL statements. They are case-insensitive, and must be quoted with `"` to be used as identifiers. The complete list is:

`AS`, `ASC`, `AND`, `BEGIN`, `BOOL`, `BOOLEAN`, `BY`, `COMMIT`, `CREATE`, `CROSS`, `DEFAULT`, `DELETE`, `DESC`, `DOUBLE`, `DROP`, `EXISTS`, `EXPLAIN`, `FALSE`, `FLOAT`, `FROM`, `GROUP`, `HAVING`, `IF`, `INDEX`, `INFINITY`, `INNER`, `INSERT`, `INT`, `INTEGER`, `INTO`, `IS`, `JOIN`, `KEY`, `LEFT`, `LIKE`, `LIMIT`, `NAN`, `NOT`, `NULL`, `OF`, `OFFSET`, `ON`, `ONLY`, `OR`, `ORDER`, `OUTER`, `PRIMARY`, `READ`, `REFERENCES`, `RIGHT`, `ROLLBACK`, `SELECT`, `SET`, `STRING`, `SYSTEM`, `TABLE`, `TEXT`, `TIME`, `TRANSACTION`, `TRUE`, `UNIQUE`, `UPDATE`, `VALUES`, `VARCHAR`, `WHERE`, `WRITE`

### Identifiers

Identifiers are names for database objects such as tables and columns. Unless quoted with `"`, they must begin with a Unicode letter followed by any combination of letters, numbers, and `_`, and cannot be reserved keywords. `""` can be used to escape a double quote character. They are always converted to lowercase.

### Constants

#### Named constants

The following keywords evaluate to constants:

* `FALSE`: the boolean false value.
* `INFINITY`: the floating-point value for infinity.
* `NAN`: the floating-point value for NaN (not a number).
* `NULL`: an unknown value.
* `TRUE`: the boolean true value.

#### String literals

String literals are surrounded by single quotes `'`, and can contain any valid UTF-8 character. Single quotes must be escaped by an additional single quote, i.e. `''`, no other escape sequences are supported. For example:

```
'A string with ''quotes'' and emojis 😀'
```

#### Numeric literals

Sequences of digits `0-9` are parsed as a 64-bit signed integer. Numbers with decimal points or in scientific notation are parsed as 64-bit floating point numbers. The following pattern is supported:

```
999[.[999]][e[+-]999]
```

The `-` prefix operator can be used to take negative numbers.

### Expressions

Expressions can be used wherever a value is expected, e.g. as `SELECT` columns and `INSERT` values. They are made up of constants, column references, operator invocations, and function calls.

Column references can either be unqualified, e.g. `name`, or prefixed with the relation identifier separated by `.`, e.g. `person.name`. Unqualified identifiers must be unambiguous.

## SQL Operators

### Logical operators

Logical operators apply standard logic operations on boolean operands.

* `AND`: the logical conjunction, e.g. `TRUE AND TRUE` yields `TRUE`.
* `OR`: the logical disjunction, e.g. `TRUE OR FALSE` yields `TRUE`.
* `NOT`: the logical negation, e.g. `NOT TRUE` yields `FALSE`.

The complete truth tables are:

| `AND`       | `TRUE`  | `FALSE` | `NULL`  |
|-------------|---------|---------|---------|
| **`TRUE`**  | `TRUE`  | `FALSE` | `NULL`  |
| **`FALSE`** | `FALSE` | `FALSE` | `FALSE` |
| **`NULL`**  | `NULL`  | `FALSE` | `NULL`  |

| `OR`        | `TRUE` | `FALSE` | `NULL` |
|-------------|--------|---------|--------|
| **`TRUE`**  | `TRUE` | `TRUE`  | `TRUE` |
| **`FALSE`** | `TRUE` | `FALSE` | `NULL` |
| **`NULL`**  | `TRUE` | `NULL`  | `NULL` |

| `NOT`       |         |
|-------------|---------|
| **`TRUE`**  | `FALSE` |
| **`FALSE`** | `TRUE`  |
| **`NULL`**  | `NULL`  |

### Comparison operators

Comparison operators compare values of the same data type, and return `TRUE` if the comparison holds or `FALSE` otherwise. `INTEGER` and `FLOAT` values are interchangeable. `STRING` comparisons use the string's byte values, i.e. case-sensitive with `'B' < 'a'` due to their UTF-8 code points. `FALSE` is considered lesser than `TRUE`. Comparison with `NULL` always yields `NULL` (even `NULL = NULL`).

Binary operators:

* `=`: equality, e.g. `1 = 1` yields `TRUE`.
* `!=`: inequality, e.g. `1 != 2` yields `TRUE`.
* `>`: greater than, e.g. `2 > 1` yields `TRUE`.
* `>=`: greater than or equal, e.g. `1 >= 1` yields `TRUE`.
* `<`: lesser than, e.g. `1 < 2` yields `TRUE`.
* `<=`: lesser than or equal, e.g. `1 <= 1` yields `TRUE`.

Unary operators:

* `IS NULL`: checks if the value is `NULL`, e.g. `NULL IS NULL` yields `TRUE`.
* `IS NOT NULL`: checks if the value is not `NULL`, e.g. `TRUE IS NOT NULL` yields `TRUE`.
* `IS NAN`: checks if the value is a float `NAN`, e.g. `NAN IS NAN` yields `TRUE`. Errors on 
  non-float datatypes, except `NULL` which yields `NULL`.
* `IS NOT NAN`: checks if the value is not a float `NAN`, e.g. `3.14 IS NOT NAN` yields `TRUE`.

### Mathematical operators

Mathematical operators apply standard math operations on numeric (`INTEGER` or `FLOAT`) operands. If either operand is a `FLOAT`, both operands are converted to `FLOAT` and the result is a `FLOAT`. If either operand is `NULL`, the result is `NULL`. The special values `INFINITY` and `NAN` are handled according to the IEEE 754 spec.

For `INTEGER` operands, failure conditions such as overflow and division by zero yield an error. For `FLOAT` operands, these return `INFINITY` or `NAN` as appropriate.

Binary operators:

* `+`: addition, e.g. `1 + 2` yields `3`.
* `-`: subtraction, e.g. `3 - 2` yields `1`.
* `*`: multiplication, e.g. `3 * 2` yields `6`.
* `/`: division, e.g. `6 / 2` yields `3`.
* `^`: exponentiation, e.g. `2 ^ 4` yields `16`.
* `%`: remainder, e.g. `8 % 3` yields `2`. Unlike modulo, the result has the sign of the dividend.

Unary operators:

* `+` (prefix): identity, e.g. `+1` yields `1`.
* `-` (prefix): negation, e.g. `- -2` yields `2`.
* `!` (postfix): factorial, e.g. `5!` yields `120`.

### String operators

String operators operate on string operands.

* `LIKE`: compares a string with the given pattern, using `%` as multi-character wildcard and `_` as single-character wildcard, returning `TRUE` if the string matches the pattern - e.g. `'abc' LIKE 'a%'` yields `TRUE`.

### Operator precedence

The operator precedence (order of operations) is as follows:

| Precedence | Operator                | Associativity |
|------------|-------------------------|---------------|
| 10         | `+`, `-` (prefix)       | Right         |
| 9          | `!` (postfix)           | Left          |
| 8          | `^`                     | Right         |
| 7          | `*`, `/`, `%`           | Left          |
| 6          | `+`, `-`                | Left          |
| 5          | `>`, `>=`, `<`, `<=`    | Left          |
| 4          | `=`, `!=`, `LIKE`, `IS` | Left          |
| 3          | `NOT`                   | Right         |
| 2          | `AND`                   | Left          |
| 1          | `OR`                    | Left          |

Precedence can be overridden by wrapping an expression in parentheses, e.g. `(1 + 2) * 3`.

### Functions

* `sqrt(expr)`: returns the square root of a numerical argument.

### Aggregate functions

Aggregate functions aggregate an expression across all rows, optionally grouped into buckets given by `GROUP BY`, and results can be filtered via `HAVING`.

* `AVG(expr)`: returns the average of numerical values.

* `COUNT(expr)`: returns the number of rows for which ***`expr`*** evaluates to a non-`NULL` value. `COUNT(*)` can be used to count all rows.

* `MAX(expr)`: returns the maximum value, according to the datatype's ordering.

* `MIN(expr)`: returns the minimum value, according to the datatype's ordering.

* `SUM(expr)`: returns the sum of numerical values.

## SQL Statements

### `BEGIN`

Starts a new [transaction](#transactions).

<pre>
BEGIN [ TRANSACTION ] [ READ ONLY | READ WRITE ] [ AS OF SYSTEM TIME <b><i>txn_id</i></b> ]
</pre>

* ***`txn_id`***: A past transaction ID to run a read-only transaction for, for time-travel queries.

### `COMMIT`

Commits an active [transaction](#transactions).

### `CREATE TABLE`

Creates a new table.

<pre>
CREATE TABLE <b><i>table_name</i></b> (
    [ <b><i>column_name</i></b> <b><i>data_type</i></b> [ <b><i>column_constraint</i></b> [ ... ] ]  [ INDEX ] [, ... ] ]
)

where <b><i>column_constraint</i></b> is:

{ NOT NULL | NULL | PRIMARY KEY | DEFAULT <b><i>expr</i></b> | REFERENCES <b><i>ref_table</i></b> | UNIQUE }
</pre>

* ***`table_name`***: The name of the table. Must be a [valid identifier](#identifiers). Errors if a table with this name already exists.

* ***`column_name`***: The name of the column. Must be a [valid identifier](#identifiers), and unique within the table.

* ***`data_type`***: The data type of the column, see [data types](#data-types) for valid types.

* `NOT NULL`: The column may not contain `NULL` values.

* `NULL`: The column may contain `NULL` values. This is the default.

* `PRIMARY KEY`: The column should act as a primary key, i.e. the main row identifier. A table must have exactly one primary key column, and it must be unique and non-nullable.

* `DEFAULT`***`expr`***: Specifies a default value for the column when `INSERT` statements do not give a value. ***`expr`*** can be any constant expression of an appropriate data type, e.g. `'abc'` or `1 + 2 * 3`. For nullable columns, the default value is `NULL` unless specified otherwise.

* `REFERENCES`***`ref_table`***: The column is a foreign key to ***`ref_table`***'s primary key, enforcing referential integrity.

* `UNIQUE`: The column may only contain unique (distinct) values. `NULL` values are not considered equal, thus a `UNIQUE` column which allows `NULL` may contain multiple `NULL` values. `PRIMARY KEY` columns are implicitly `UNIQUE`.

* `INDEX`: Create an index for the column.

#### Example

```sql
CREATE TABLE movie (
    id INTEGER PRIMARY KEY,
    title STRING NOT NULL,
    release_year INTEGER INDEX,
    imdb_id STRING INDEX UNIQUE,
    bluray BOOLEAN NOT NULL DEFAULT TRUE
)
```

### `DELETE`

Deletes rows in a table.

<pre>
DELETE FROM <b><i>table_name</i></b>
    [ WHERE <b><i>predicate</i></b> ]
</pre>

Deletes rows where ***`predicate`*** evaluates to `TRUE`, or all rows if no `WHERE` clause is given.

* ***`table_name`***: the table to delete from. Errors if it does not exist.

* ***`predicate`***: an expression which determines which rows to delete by evaluating to `TRUE`. Must evaluate to a `BOOLEAN` or `NULL`, otherwise an error is returned.

#### Example

```sql
DELETE FROM movie
WHERE release_year < 2000 AND bluray = FALSE
```

### `DROP TABLE`

Deletes a table and all contained data. Errors if the table does not
exist, unless `IF EXISTS` is given.

<pre>
DROP TABLE [ IF EXISTS ] <b><i>table_name</i></b>
</pre>

* ***`table_name`***: the table to delete.

### `EXPLAIN`

Outputs the execution plan for the given statement.

<pre>
EXPLAIN [ <b><i>statement</i></b> ]
</pre>

### `INSERT`

Inserts rows into a table.

<pre>
INSERT INTO <b><i>table_name</i></b>
    [ ( <b><i>column_name</i></b> [, ... ] ) ]
    VALUES ( <b><i>expression</i></b> [, ... ] ) [, ... ]
</pre>

If column names are given, an identical number of values must be given. If no column names are given, values must be given in the table's column order. Omitted columns will get a default value if specified, otherwise an error will be returned.

* ***`table_name`***: the table to insert into. Errors if it does not exist.

* ***`column_name`***: a column to insert into in the given table. Errors if it does not exist.

* ***`expression`***: an expression to insert into the corresponding column. Must be a constant expression, i.e. it cannot refer to table columns.

#### Example

```sql
INSERT INTO movie
    (id, title, release_year)
VALUES
    (1, 'Sicario', 2015),
    (2, 'Stalker', 1979),
    (3, 'Her', 2013)
```

### `ROLLBACK`

Rolls back an active [transaction](#transactions).

### `SELECT`

Selects rows from a table.

<pre>
SELECT [ * | <b><i>expression</i></b> [ [ AS ] <b><i>output_name</i></b> [, ...] ] ]
    [ FROM <b><i>from_item</i></b> [, ...] ]
    [ WHERE <b><i>predicate</i></b> ]
    [ GROUP BY <b><i>group_expr</i></b> [, ...] ]
    [ HAVING <b><i>having_expr</i></b> ]
    [ ORDER BY <b><i>order_expr</i></b> [ ASC | DESC ] [, ...] ]
    [ LIMIT <b><i>count</i></b> ]
    [ OFFSET <b><i>start</i></b> ]

where <b><i>from_item</i></b> is one of:

<b><i>table_name</i></b> [ [ AS ] <b><i>alias</i></b> ]
<b><i>from_item</i></b> <b><i>join_type</i></b> <b><i>from_item</i></b> [ ON <b><i>join_predicate</i></b> ]

where <b><i>join_type</i></b> is one of:

CROSS JOIN
[ INNER ] JOIN
LEFT [ OUTER ] JOIN
RIGHT [ OUTER ] JOIN

</pre>

Fetches rows or expressions, either from table ***`table_name`*** (if given) or generated.

* ***`expression`***: [expression](#expressions) to fetch (can be a simple column name).

* ***`output_name`***: output column [identifier](#identifiers), defaults to column name (if single column) otherwise nothing (displayed as `?`).

* ***`table_name`***: table to fetch rows from.

* ***`alias`***: table alias.

* ***`predicate`***: only return rows for which this [expression](#expressions) evaluates to `TRUE`.

* ***`group_expr`***: an expression to group aggregates by. Non-aggregate `SELECT` expressions must either reference a column given in `group_expr`, be identical with a `group_expr`, or have an `output_name` that is referenced by a `group_expr` column.

* ***`having_expr`***: only return aggregate results for which this [expression](#expressions) evaluates to `TRUE`.

* ***`order_expr`***: order rows by this expression (can be a simple column name).

* ***`count`***: maximum number of rows to return. Must be a constant integer expression.

* ***`start`***: number of rows to skip. Must be a constant integer expression.

* ***`join_predicate`***: only return rows for which this [expression](#expressions) evaluates to `TRUE`.

Join types:

* `CROSS JOIN`: returns the Cartesian product of the joined tables. Does not accept a join predicate (`ON` clause).

* `INNER JOIN`: returns the rows of the tables' Cartesian product for which ***`join_predicate`*** evaluates to `TRUE`.

* `LEFT OUTER JOIN`: returns the rows joined on the ***`join_predicate`***, or for any rows in the left table that do not have a match in the right table a single row is returned with the right table's columns set to `NULL`.

* `RIGHT OUTER JOIN`: the same as a `LEFT OUTER JOIN` but with the left and right tables switched.

#### Example

```sql
SELECT id, title, 2020 - released AS age
FROM movies
WHERE released >= 2000 AND ultrahd
ORDER BY released DESC, title ASC
LIMIT 10
OFFSET 10
```

### `UPDATE`

Updates rows in a table.

<pre>
UPDATE <b><i>table_name</i></b>
    SET <b><i>column_name</i></b> = <b><i>expression</i></b> | DEFAULT [, ... ]
    [ WHERE <b><i>predicate</i></b> ]
</pre>

Updates columns given by ***`column_name`*** to the corresponding ***`expression`*** for all rows where ***`predicate`*** evaluates to `TRUE`. If no `WHERE` clause is given, all rows are updated.

* ***`table_name`***: the table to update. Errors if it does not exist.

* ***`column_name`***: a column to update. Errors if it does not exist.

* ***`expression`***: an expression whose evaluated value will be set for the corresponding column and row. Expressions can refer to column values, and must evaluate to the same datatype as the updated column. Using `DEFAULT` will set the column's default value, if any.

* ***`predicate`***: an expression which determines which rows to update by evaluating to `TRUE`. Must evaluate to a `BOOLEAN` or `NULL`, otherwise an error is returned.

#### Example

```sql
UPDATE movie
SET bluray = TRUE
WHERE release_year >= 2000 AND bluray = FALSE
```

## Transactions

toyDB supports ACID transactions using MVCC-based snapshot isolation, protecting from the following anomalies: dirty writes, dirty reads, lost updates, fuzzy reads, read skew, and phantom reads. However, write skew anomalies are possible since serializable snapshot isolation is not implemented.

A new transaction is started with `BEGIN`, and ended with either `COMMIT` (atomically writing all changes) or `ROLLBACK` (discarding all changes). If any conflicts occur between concurrent transactions, the lowest transaction ID wins and the others will fail with a serialization error and must retry.

All past data is versioned and retained, and can be queried as of a given transaction ID via `BEGIN TRANSACTION READ ONLY AS OF SYSTEM TIME <txn_id>`.

A transaction is still valid for use if a contained statement returns an error. It is up to the client to take appropriate action.


================================================
FILE: docs/tools/update-links.py
================================================
#!/usr/bin/env python3
#
# Updates GitHub code links to the latest commit SHA.

import os, re, sys, argparse
import requests

GITHUB_API = "https://api.github.com"

def get_latest_sha(owner, repo, path, token, timeout=30):
    """Returns the SHA of the newest commit on `main` that touched `path`.

    Queries the GitHub commits API for the given repository, asking for a
    single result filtered by `path`.

    Args:
        owner: GitHub repository owner.
        repo: GitHub repository name.
        path: File path within the repository.
        token: Optional GitHub API token; sent as an Authorization header
            when truthy.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, requests would wait indefinitely on a
            stalled connection.

    Returns:
        The commit SHA string, or None if the API returned no commits.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if the request exceeds `timeout`.
    """
    url = f"{GITHUB_API}/repos/{owner}/{repo}/commits"
    headers = {}
    if token:
        headers["Authorization"] = f"token {token}"
    params = {"path": path, "sha": "main", "per_page": 1}
    resp = requests.get(url, headers=headers, params=params, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    return data[0]["sha"] if data else None

def process_markdown(text, token):
    """Rewrites GitHub blob links in `text` to point at the latest commit SHA.

    Every `https://github.com/<owner>/<repo>/blob/<sha>/<path>` URL has its
    `<sha>` replaced by the SHA returned by `get_latest_sha` for that file.
    Lookups are cached per (owner, repo, path), so each file is queried once.

    Args:
        text: The Markdown document to process.
        token: Optional GitHub API token, passed through to `get_latest_sha`.

    Returns:
        The document with updated links. Links whose SHA is already current,
        or for which no commit was found, are left untouched.
    """
    pattern = re.compile(
        r"https://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/blob/"
        r"(?P<oldsha>[0-9a-f]{7,40})/(?P<path>[^#)\s]+)"
    )
    cache = {}

    def replacer(m):
        url = m.group(0)
        # Progress goes to stderr: in stdin/stdout mode, stdout carries the
        # rewritten document and must not be mixed with diagnostics.
        print(f"Checking {url}", file=sys.stderr)
        owner, repo, oldsha, path = m.group("owner", "repo", "oldsha", "path")
        key = (owner, repo, path)
        print(f"Key: {key}", file=sys.stderr)
        if key not in cache:
            cache[key] = get_latest_sha(owner, repo, path, token)
        newsha = cache[key]
        if newsha and newsha != oldsha:
            print(f"Updating {url} to {newsha}", file=sys.stderr)
            # Replace only the ref component. A plain str.replace would also
            # rewrite any occurrence of the old SHA inside the file path.
            start = m.start("oldsha") - m.start()
            end = m.end("oldsha") - m.start()
            return url[:start] + newsha + url[end:]
        return url

    return pattern.sub(replacer, text)

def main():
    """Command-line entry point.

    With a file argument, the file is rewritten in place. Without one, the
    document is read from stdin and the result is written to stdout. The
    GitHub API token is taken from the GITHUB_TOKEN environment variable,
    if set.
    """
    p = argparse.ArgumentParser(description="Update GitHub blob links to latest SHAs")
    p.add_argument("file", nargs="?", help="Markdown file to update (defaults to stdin/stdout)")
    args = p.parse_args()
    token = os.getenv("GITHUB_TOKEN")
    if args.file:
        # Read through a context manager so the handle is closed before the
        # same file is reopened for writing.
        with open(args.file, encoding="utf-8") as f:
            text = f.read()
        updated = process_markdown(text, token)
        with open(args.file, "w", encoding="utf-8") as f:
            f.write(updated)
    else:
        text = sys.stdin.read()
        sys.stdout.write(process_markdown(text, token))

# Run the tool only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: rust-toolchain
================================================
1.93.1

================================================
FILE: rustfmt.toml
================================================
use_small_heuristics = "Max"


================================================
FILE: src/bin/toydb.rs
================================================
//! The toyDB server. Takes configuration from a config file (default
//! config/toydb.yaml) or corresponding TOYDB_ environment variables. Listens
//! for SQL clients (default port 9601) and Raft connections from other toyDB
//! peers (default port 9701). The Raft log and SQL database are stored at
//! data/raft and data/sql by default.
//!
//! Use the toysql command-line client to connect to the server.

#![warn(clippy::all)]

use std::collections::HashMap;
use std::path::Path;

use clap::Parser as _;
use serde::Deserialize;

use toydb::Server;
use toydb::errinput;
use toydb::error::Result;
use toydb::raft;
use toydb::sql;
use toydb::storage;

/// Parses the command line and runs the toyDB server. On failure the error is
/// printed to stderr and the process exits with status 1, so that scripts and
/// supervisors can detect that the server did not run successfully.
fn main() {
    if let Err(error) = Command::parse().run() {
        eprintln!("Error: {error}");
        std::process::exit(1);
    }
}

/// The toyDB server configuration. Can be provided via config file (default
/// config/toydb.yaml) or TOYDB_ environment variables.
///
/// The defaults mentioned on the fields below are the ones registered by
/// `Config::load`. Values from the config file and environment override them.
#[derive(Debug, Deserialize)]
struct Config {
    /// The node ID. Must be unique in the cluster. Defaults to 1.
    id: raft::NodeID,
    /// The other nodes in the cluster, and their Raft TCP addresses, keyed
    /// by node ID. `Config::load` registers no default for this field.
    peers: HashMap<raft::NodeID, String>,
    /// The Raft listen address. Defaults to localhost:9701.
    listen_raft: String,
    /// The SQL listen address. Defaults to localhost:9601.
    listen_sql: String,
    /// The log level. Defaults to info.
    log_level: String,
    /// The path to this node's data directory. The Raft log is stored in
    /// the file "raft", and the SQL state machine in "sql". Defaults to
    /// "data".
    data_dir: String,
    /// The Raft storage engine: bitcask or memory. Defaults to bitcask. An
    /// empty string also selects bitcask.
    storage_raft: String,
    /// The SQL storage engine: bitcask or memory. Defaults to bitcask. An
    /// empty string also selects bitcask.
    storage_sql: String,
    /// If false, don't fsync Raft log writes to disk. Disabling this
    /// will yield much better write performance, but may lose data on
    /// host crashes which compromises Raft safety guarantees. Defaults to
    /// true.
    fsync: bool,
    /// The garbage fraction threshold at which to trigger compaction.
    /// Defaults to 0.2. Passed to the BitCask engines when they are opened.
    compact_threshold: f64,
    /// The minimum bytes of garbage before triggering compaction. Defaults
    /// to 1,000,000. Passed to the BitCask engines when they are opened.
    compact_min_bytes: u64,
}

impl Config {
    /// Builds the configuration by layering, in increasing priority: the
    /// built-in defaults, the given config file, and TOYDB-prefixed
    /// environment variables.
    fn load(file: &str) -> Result<Self> {
        // Built-in defaults, registered first so later sources override them.
        let defaults = config::Config::builder()
            .set_default("id", "1")?
            .set_default("listen_sql", "localhost:9601")?
            .set_default("listen_raft", "localhost:9701")?
            .set_default("log_level", "info")?
            .set_default("data_dir", "data")?
            .set_default("storage_raft", "bitcask")?
            .set_default("storage_sql", "bitcask")?
            .set_default("fsync", true)?
            .set_default("compact_threshold", 0.2)?
            .set_default("compact_min_bytes", 1_000_000)?;

        // Layer the config file and the environment on top, then decode.
        let settings = defaults
            .add_source(config::File::with_name(file))
            .add_source(config::Environment::with_prefix("TOYDB"))
            .build()?;
        Ok(settings.try_deserialize()?)
    }
}

/// The toyDB server command.
//
// NOTE: with the clap derive, the `///` comments on the fields below are used
// as the --help text for the corresponding arguments, so editing them changes
// user-visible output.
#[derive(clap::Parser)]
#[command(about = "Starts a toyDB server.", version, propagate_version = true)]
struct Command {
    /// The configuration file path.
    // Passed to Config::load() by run(). Given as -c or --config.
    #[arg(short = 'c', long, default_value = "config/toydb.yaml")]
    config: String,
}

impl Command {
    /// Starts the server: loads the configuration, initializes logging,
    /// opens the Raft log and SQL state machine storage, and then serves
    /// Raft and SQL connections on the configured addresses.
    ///
    /// Returns an input error if an unknown storage engine is configured.
    fn run(self) -> Result<()> {
        let config = Config::load(&self.config)?;

        // Set up logging. Unless the debug level was requested, an allow
        // filter for "toydb" is installed on the logger.
        let level = config.log_level.parse()?;
        let mut log_builder = simplelog::ConfigBuilder::new();
        if level != simplelog::LevelFilter::Debug {
            log_builder.add_filter_allow_str("toydb");
        }
        simplelog::SimpleLogger::init(level, log_builder.build())?;

        let data_dir = Path::new(&config.data_dir);

        // Open the Raft log on the configured engine. An empty engine name
        // is treated as bitcask.
        let mut log = match config.storage_raft.as_str() {
            "memory" => raft::Log::new(Box::new(storage::Memory::new()))?,
            "bitcask" | "" => {
                let bitcask = storage::BitCask::new_maybe_compact(
                    data_dir.join("raft"),
                    config.compact_threshold,
                    config.compact_min_bytes,
                )?;
                raft::Log::new(Box::new(bitcask))?
            }
            name => return errinput!("invalid Raft storage engine {name}"),
        };
        log.enable_fsync(config.fsync);

        // Open the SQL state machine on the configured engine, with the same
        // empty-name fallback as above.
        let state: Box<dyn raft::State> = match config.storage_sql.as_str() {
            "memory" => Box::new(sql::engine::Raft::new_state(storage::Memory::new())?),
            "bitcask" | "" => {
                let bitcask = storage::BitCask::new_maybe_compact(
                    data_dir.join("sql"),
                    config.compact_threshold,
                    config.compact_min_bytes,
                )?;
                Box::new(sql::engine::Raft::new_state(bitcask)?)
            }
            name => return errinput!("invalid SQL storage engine {name}"),
        };

        // Hand everything to the server and serve until it returns.
        Server::new(config.id, config.peers, log, state)?
            .serve(&config.listen_raft, &config.listen_sql)
    }
}


================================================
FILE: src/bin/toydump.rs
================================================
//! toydump is a debug tool that prints a toyDB BitCask database in
//! human-readable form. It can print both the SQL database and the Raft log
//! (via --raft). It only outputs live BitCask data, not garbage entries.

#![warn(clippy::all)]

use clap::Parser as _;

use toydb::encoding::format::{self, Formatter as _};
use toydb::error::Result;
use toydb::storage::{BitCask, Engine as _};

/// Parses the command line and runs toydump. On failure the error is printed
/// to stderr and the process exits with status 1, so that callers can tell a
/// failed dump from a successful one.
fn main() {
    if let Err(error) = Command::parse().run() {
        eprintln!("Error: {error}");
        std::process::exit(1);
    }
}

/// The toydump command.
//
// NOTE: with the clap derive, the `///` comments on the fields below are used
// as the --help text for the corresponding arguments, so editing them changes
// user-visible output.
#[derive(clap::Parser)]
#[command(about = "Prints toyDB file contents.", version, propagate_version = true)]
struct Command {
    /// The BitCask file to dump (SQL database unless --raft).
    // Positional argument; opened with BitCask::new() in run().
    file: String,
    /// The file is a Raft log, not SQL database.
    // Selects the Raft formatter instead of the MVCC/SQL formatter in run().
    #[arg(long)]
    raft: bool,
    /// Also show raw key and value.
    // Appends the Raw-formatted key/value in brackets to each output line.
    #[arg(long)]
    raw: bool,
}

impl Command {
    /// Opens the BitCask file and prints every key/value pair it yields, one
    /// per line, formatted either as Raft log entries (--raft) or as MVCC
    /// SQL data. With --raw, the raw key/value is appended in brackets.
    fn run(self) -> Result<()> {
        let mut engine = BitCask::new(self.file.into())?;
        for entry in engine.scan(..) {
            let (key, value) = entry?;
            // Pick the formatter matching the file contents.
            let string = if self.raft {
                format::Raft::<format::SQLCommand>::key_value(&key, &value)
            } else {
                format::MVCC::<format::SQL>::key_value(&key, &value)
            };
            // Optionally append the raw representation.
            let string = if self.raw {
                format!("{string} [{}]", format::Raw::key_value(&key, &value))
            } else {
                string
            };
            println!("{string}");
        }
        Ok(())
    }
}


================================================
FILE: src/bin/toysql.rs
================================================
//! toySQL is a command-line client for toyDB. It connects to a toyDB node
//! (default localhost:9601) and executes SQL statements against it via an
//! interactive shell interface. Command history is stored in .toysql.history.

#![warn(clippy::all)]

use std::path::PathBuf;

use clap::Parser as _;
use itertools::Itertools as _;
use rustyline::error::ReadlineError;
use rustyline::history::DefaultHistory;
use rustyline::validate::{ValidationContext, ValidationResult, Validator};
use rustyline::{Editor, Modifiers};
use rustyline_derive::{Completer, Helper, Highlighter, Hinter};

use toydb::Client;
use toydb::errinput;
use toydb::error::Result;
use toydb::sql::execution::StatementResult;
use toydb::sql::parser::{Lexer, Token};

/// Parses the command line and runs toySQL. On failure the error is printed
/// to stderr and the process exits with status 1, so that scripts running a
/// one-shot statement can detect that it failed.
fn main() {
    if let Err(error) = Command::parse().run() {
        eprintln!("Error: {error}");
        std::process::exit(1);
    }
}

/// The toySQL command.
//
// NOTE: with the clap derive, the `///` comments on the fields below are used
// as the --help text for the corresponding arguments, so editing them changes
// user-visible output.
#[derive(clap::Parser)]
#[command(about = "A toyDB client.", version, propagate_version = true)]
struct Command {
    /// A SQL statement to execute, then exit.
    // Optional positional argument. When absent, run() starts the
    // interactive shell instead.
    #[arg()]
    statement: Option<String>,
    /// Host to connect to.
    #[arg(short = 'H', long, default_value = "localhost")]
    host: String,
    /// Port number to connect to.
    #[arg(short = 'p', long, default_value = "9601")]
    port: u16,
}

impl Command {
    /// Connects a shell to the server, then either executes the single
    /// statement given on the command line or enters the interactive shell.
    fn run(self) -> Result<()> {
        let mut shell = Shell::new(&self.host, self.port)?;
        // One-shot mode: run the given statement and return its result.
        if let Some(statement) = self.statement {
            return shell.execute(&statement);
        }
        // Otherwise hand over to the interactive loop.
        shell.run()
    }
}

/// An interactive toySQL shell. Holds the server connection, the line editor,
/// and the display settings that `!` commands can change.
struct Shell {
    /// The toyDB client.
    client: Client,
    /// The Rustyline command editor, using `InputValidator` as its helper.
    editor: Editor<InputValidator, DefaultHistory>,
    /// The path to the history file, if any. Set to `.toysql.history` under
    /// `$HOME` when that variable is present, otherwise `None`.
    history_path: Option<PathBuf>,
    /// If true, SELECT column headers will be displayed. Starts out false
    /// and is toggled by the `!headers` command.
    show_headers: bool,
}

impl Shell {
    /// Connects to the toyDB server at the given host and port and sets up the
    /// line editor. Command history is kept in $HOME/.toysql.history, or not
    /// at all if HOME is unset. Column headers start out disabled.
    fn new(host: &str, port: u16) -> Result<Self> {
        let client = Client::connect((host, port))?;

        // Set up Rustyline with the multiline input validator. The bracketed
        // paste start key is bound to a no-op to make sure multiline pastes
        // are handled normally.
        let paste_start =
            rustyline::KeyEvent(rustyline::KeyCode::BracketedPasteStart, Modifiers::NONE);
        let mut editor = Editor::new()?;
        editor.set_helper(Some(InputValidator));
        editor.bind_sequence(paste_start, rustyline::Cmd::Noop);

        let history_path = std::env::var_os("HOME")
            .map(PathBuf::from)
            .map(|home| home.join(".toysql.history"));

        Ok(Self { client, editor, history_path, show_headers: false })
    }

    /// Dispatches a line of input: input beginning with ! is run as a shell
    /// command, any other non-empty input is executed as SQL, and empty input
    /// does nothing.
    fn execute(&mut self, input: &str) -> Result<()> {
        match input {
            "" => Ok(()),
            command if command.starts_with('!') => self.execute_command(command),
            statement => self.execute_sql(statement),
        }
    }

    /// Executes a toySQL ! command (e.g. !help).
    ///
    /// The input is split on ASCII whitespace into a command name and its
    /// arguments, and dispatched on both. Each command has an arm matching its
    /// valid argument count, followed by an arm that rejects any other
    /// argument count with an input error. Unknown commands are rejected with
    /// an input error as well. Output is printed to stdout.
    fn execute_command(&mut self, input: &str) -> Result<()> {
        let mut input = input.split_ascii_whitespace();
        let Some(command) = input.next() else {
            return errinput!("expected command");
        };
        let args = input.collect_vec();

        match (command, args.as_slice()) {
            // Toggles column headers.
            ("!headers", []) => {
                self.show_headers = !self.show_headers;
                match self.show_headers {
                    true => println!("Headers enabled"),
                    false => println!("Headers disabled"),
                }
            }
            ("!headers", _) => return errinput!("!headers takes no arguments"),

            // Displays help.
            ("!help", []) => println!(
                r#"
Enter a SQL statement terminated by a semicolon (;) to execute it, or Ctrl-D to
exit. The following commands are also available:

    !headers           Toggles column headers
    !help              This help message
    !status            Display server status
    !table NAME        Display a table schema
    !tables            List tables
"#
            ),
            ("!help", _) => return errinput!("!help takes no arguments"),

            // Displays server status. Sizes are converted from bytes to MB
            // (dividing by 1,000,000), and the node count is taken from the
            // number of entries in the Raft match index.
            ("!status", []) => {
                let status = self.client.status()?;
                println!(
                    r#"
Server:       n{server} with Raft leader n{leader} in term {term} for {nodes} nodes
Raft log:     {committed} committed, {applied} applied, {raft_size} MB, {raft_garbage}% garbage ({raft_storage} engine)
Replication:  {raft_match}
SQL storage:  {sql_keys} keys, {sql_size} MB logical, {nodes}x {sql_disk_size} MB disk, {sql_garbage}% garbage ({sql_storage} engine)
Transactions: {active_txns} active, {versions} total
"#,
                    server = status.server,
                    leader = status.raft.leader,
                    term = status.raft.term,
                    nodes = status.raft.match_index.len(),
                    committed = status.raft.commit_index,
                    applied = status.raft.applied_index,
                    raft_size =
                        format_args!("{:.3}", status.raft.storage.size as f64 / 1_000_000.0),
                    raft_garbage =
                        format_args!("{:.0}", status.raft.storage.garbage_disk_percent()),
                    raft_storage = status.raft.storage.name,
                    raft_match =
                        status.raft.match_index.iter().map(|(n, m)| format!("n{n}:{m}")).join(" "),
                    sql_keys = status.mvcc.storage.keys,
                    sql_size = format_args!("{:.3}", status.mvcc.storage.size as f64 / 1_000_000.0),
                    sql_disk_size =
                        format_args!("{:.3}", status.mvcc.storage.disk_size as f64 / 1_000_000.0),
                    sql_garbage = format_args!("{:.0}", status.mvcc.storage.garbage_disk_percent()),
                    sql_storage = status.mvcc.storage.name,
                    active_txns = status.mvcc.active_txns,
                    versions = status.mvcc.versions,
                )
            }
            ("!status", _) => return errinput!("!status takes no arguments"),

            // Displays the schema of the named table, as fetched from the
            // server.
            ("!table", [name]) => println!("{}", self.client.get_table(name)?),
            ("!table", _) => return errinput!("!table takes 1 argument"),

            // Lists the tables returned by the server, one per line.
            ("!tables", []) => self.client.list_tables()?.iter().for_each(|t| println!("{t}")),
            ("!tables", _) => return errinput!("!tables takes no arguments"),

            // Anything else is an unknown command.
            (command, _) => return errinput!("unknown command {command}"),
        }
        Ok(())
    }

    /// Sends a SQL statement to the server and prints a summary of the result
    /// to stdout. SELECT results are printed one row per line with
    /// comma-separated values, preceded by a header line when column headers
    /// are enabled.
    fn execute_sql(&mut self, statement: &str) -> Result<()> {
        use StatementResult::*;
        let result = self.client.execute(statement)?;
        match result {
            Begin(state) if state.read_only => {
                println!("Began read-only transaction at version {}", state.version)
            }
            Begin(state) => println!("Began transaction {}", state.version),
            Commit { version } => println!("Committed transaction {version}"),
            Rollback { version } => println!("Rolled back transaction {version}"),
            Insert { count } => println!("Inserted {count} rows"),
            Delete { count } => println!("Deleted {count} rows"),
            Update { count } => println!("Updated {count} rows"),
            CreateTable { name } => println!("Created table {name}"),
            DropTable { name, existed: true } => println!("Dropped table {name}"),
            DropTable { name, existed: false } => println!("Table {name} does not exist"),
            Explain(plan) => println!("{plan}"),
            Select { columns, rows } => {
                // The header line is optional, toggled via !headers.
                if self.show_headers {
                    println!("{}", columns.iter().map(|column| column.as_header()).join(", "));
                }
                for row in rows {
                    println!("{}", row.iter().join(", "));
                }
            }
        }
        Ok(())
    }

    /// Prompts the user for input and returns the entered text. The prompt
    /// shows the current transaction version, if any, using @ for read-only
    /// transactions and : for read-write ones. Interrupts (Ctrl-C) and end of
    /// input (Ctrl-D) are returned as Rustyline errors, which the caller
    /// handles.
    fn prompt(&mut self) -> rustyline::Result<String> {
        let prompt = self.client.txn().map_or_else(
            || "toydb> ".to_string(),
            |txn| {
                if txn.read_only {
                    format!("toydb@{}> ", txn.version)
                } else {
                    format!("toydb:{}> ", txn.version)
                }
            },
        );
        self.editor.readline(&prompt)
    }

    /// Runs the interactive shell: loads the command history, greets the user,
    /// and then reads and executes input until end of input (Ctrl-D), after
    /// which the history is saved. Statement errors are printed and do not
    /// terminate the shell, while editor errors do.
    fn run(&mut self) -> Result<()> {
        // Load the history file, if any. A missing file is not an error, since
        // it won't exist until the shell has been run at least once.
        if let Some(history_path) = &self.history_path {
            let loaded = self.editor.load_history(history_path);
            if let Err(error) = loaded {
                let not_found = matches!(
                    &error,
                    ReadlineError::Io(io) if io.kind() == std::io::ErrorKind::NotFound
                );
                if !not_found {
                    return Err(error.into());
                }
            }
        }

        // Print welcome message.
        let server = self.client.status()?.server;
        println!("Connected to toyDB node n{server}. Enter !help for instructions.");

        // Prompt for commands and execute them. Ctrl-C discards the current
        // input and prompts again, Ctrl-D exits.
        loop {
            let line = match self.prompt() {
                Ok(line) => line,
                Err(ReadlineError::Interrupted) => continue,
                Err(ReadlineError::Eof) => break,
                Err(error) => return Err(error.into()),
            };
            let input = line.trim();
            self.editor.add_history_entry(input)?;
            if let Err(error) = self.execute(input) {
                eprintln!("Error: {error}");
            }
        }

        // Save the history file.
        if let Some(history_path) = &self.history_path {
            self.editor.save_history(history_path)?;
        }
        Ok(())
    }
}

/// A Rustyline helper that enables multiline editing. Whenever a line is
/// entered, it decides whether the input so far should be submitted (empty
/// input, ! commands, and SQL containing a semicolon), or whether the editor
/// should keep reading further lines.
#[derive(Completer, Helper, Highlighter, Hinter)]
struct InputValidator;

impl Validator for InputValidator {
    /// Returns Valid if the input is ready to be submitted, otherwise
    /// Incomplete to wait for more input.
    fn validate(&self, ctx: &mut ValidationContext) -> rustyline::Result<ValidationResult> {
        let input = ctx.input();

        // Empty lines, a lone semicolon, and ! commands are always ready.
        let trivial = input.is_empty() || input == ";" || input.starts_with('!');

        // SQL statements are ready once the lexer finds a semicolon. Lexer
        // errors also count as ready: the input is submitted anyway, relying
        // on the server for further validation and error handling.
        let ready = trivial
            || Lexer::new(input).any(|token| matches!(token, Ok(Token::Semicolon) | Err(_)));

        if ready { Ok(ValidationResult::Valid(None)) } else { Ok(ValidationResult::Incomplete) }
    }

    /// Only validate after completed lines, not on every keystroke.
    fn validate_while_typing(&self) -> bool {
        false
    }
}


================================================
FILE: src/bin/workload.rs
================================================
//! Runs toyDB workload benchmarks. By default, it assumes a running 5-node
//! cluster as launched via cluster/run.sh, but this can be modified via -H.
//! For example, a read-only workload can be run as:
//!
//! cargo run --release --bin workload -- read
//!
//! See --help for a list of available workloads and arguments.

#![warn(clippy::all)]

use std::cmp::min;
use std::collections::HashSet;
use std::io::Write as _;
use std::time::{Duration, Instant};

use clap::Parser;
use hdrhistogram::Histogram;
use itertools::Itertools as _;
use rand::SeedableRng as _;
use rand::distr::Distribution as _;
use rand::rngs::StdRng;
use rand::seq::IndexedRandom as _;

use toydb::error::Result;
use toydb::sql::types::{Row, Rows};
use toydb::{Client, StatementResult};

/// Entry point: parses the command line and runs the selected workload with
/// the configured runner. Errors are printed to stderr and reported via a
/// non-zero exit status, so that scripts can detect a failed benchmark run.
fn main() {
    let Command { runner, subcommand } = Command::parse();
    let result = match subcommand {
        Subcommand::Read(read) => runner.run(read),
        Subcommand::Write(write) => runner.run(write),
        Subcommand::Bank(bank) => runner.run(bank),
    };
    if let Err(error) = result {
        eprintln!("Error: {error}");
        std::process::exit(1);
    }
}

/// Handles command-line parsing.
#[derive(clap::Parser)]
#[command(about = "Runs toyDB workload benchmarks.", version, propagate_version = true)]
struct Command {
    // The runner's options are flattened into the top-level command, so they
    // are given before the workload subcommand.
    #[command(flatten)]
    runner: Runner,

    // The workload to run, along with its workload-specific options.
    #[command(subcommand)]
    subcommand: Subcommand,
}

// The available workloads, one subcommand each. Each variant wraps the
// workload's own argument struct, which also implements Workload. Regular
// comments are used here since doc comments would end up in the help output.
#[derive(clap::Subcommand)]
enum Subcommand {
    Read(Read),
    Write(Write),
    Bank(Bank),
}

/// Runs a workload benchmark.
// NOTE: the `///` comments on the fields below double as clap help text.
#[derive(clap::Args)]
struct Runner {
    /// Hosts to connect to (optionally with port number).
    // Given as a comma-separated list. Workers are assigned to hosts round
    // robin, and the first host is also used to prepare and verify the dataset.
    #[arg(
        short = 'H',
        long,
        value_delimiter = ',',
        default_value = "localhost:9601,localhost:9602,localhost:9603,localhost:9604,localhost:9605"
    )]
    hosts: Vec<String>,

    /// Number of concurrent workers to spawn.
    // Each worker runs in its own thread with its own client connection.
    #[arg(short, long, default_value = "16")]
    concurrency: usize,

    /// Number of transactions to execute.
    // This is the number of work items taken from the workload's generator.
    #[arg(short = 'n', long, default_value = "100000")]
    count: usize,

    /// Seed to use for random number generation.
    // Seeds the RNG that is used to prepare the dataset and generate work.
    #[arg(short, long, default_value = "16791084677885396490")]
    seed: u64,
}

impl Runner {
    /// Runs the specified workload.
    ///
    /// This prepares the initial dataset via the first host, spawns
    /// `concurrency` worker threads (connected round robin across hosts) that
    /// execute `count` generated work items, prints latency statistics about
    /// once a second while they run, and finally verifies the dataset.
    ///
    /// Returns an error if setup fails, if any worker fails (the first worker
    /// error is returned once all workers have exited), or if verification
    /// fails. A worker panic is propagated as a panic.
    fn run<W: Workload>(self, workload: W) -> Result<()> {
        let mut rng = StdRng::seed_from_u64(self.seed);
        let mut client = Client::connect(&self.hosts[0])?;

        // Set up a histogram recording txn latencies as nanoseconds. The
        // buckets range from 1µs (1_000 ns) to 10s (10_000_000_000 ns).
        let mut hist = Histogram::<u32>::new_with_bounds(1_000, 10_000_000_000, 3)?.into_sync();

        // Prepare the dataset.
        print!("Preparing initial dataset... ");
        std::io::stdout().flush()?;
        let start = Instant::now();
        workload.prepare(&mut client, &mut rng)?;
        println!("done ({:.3}s)", start.elapsed().as_secs_f64());

        // Spawn workers, round robin across hosts.
        std::thread::scope(|s| -> Result<()> {
            print!("Spawning {} workers... ", self.concurrency);
            std::io::stdout().flush()?;
            let start = Instant::now();

            let (work_tx, work_rx) = crossbeam::channel::bounded(self.concurrency);
            let (done_tx, done_rx) = crossbeam::channel::bounded::<()>(0);

            // Keep the worker handles around, so that worker errors can be
            // reported once the workers have exited.
            let mut workers = Vec::with_capacity(self.concurrency);
            for addr in self.hosts.iter().cycle().take(self.concurrency) {
                let mut client = Client::connect(addr)?;
                let mut recorder = hist.recorder();
                let work_rx = work_rx.clone();
                let done_tx = done_tx.clone();
                workers.push(s.spawn(move || -> Result<()> {
                    while let Ok(item) = work_rx.recv() {
                        let start = Instant::now();
                        client.with_retry(|client| W::execute(client, &item))?;
                        recorder.record(start.elapsed().as_nanos() as u64)?;
                    }
                    drop(done_tx); // disconnects done_rx once all workers exit
                    Ok(())
                }));
            }
            drop(done_tx); // drop local copy

            println!("done ({:.3}s)", start.elapsed().as_secs_f64());

            // Spawn work generator. It exits once all items have been sent, or
            // when sending fails because all work receivers have been dropped.
            {
                println!("Running workload {}...", workload);
                let generator = workload.generate(rng)?.take(self.count);
                s.spawn(move || -> Result<()> {
                    for item in generator {
                        work_tx.send(item)?;
                    }
                    Ok(())
                });
            }

            // Periodically print stats until all workers are done.
            let start = Instant::now();
            let ticker = crossbeam::channel::tick(Duration::from_secs(1));

            println!();
            println!("Time   Progress     Txns      Rate       p50       p90       p99      pMax");

            while let Err(crossbeam::channel::TryRecvError::Empty) = done_rx.try_recv() {
                // Wait for the next tick, or for the workers to complete. Stats
                // are printed in either case, so there's always a final line.
                crossbeam::select! {
                    recv(ticker) -> _ => {},
                    recv(done_rx) -> _ => {},
                }

                let duration = start.elapsed().as_secs_f64();
                hist.refresh_timeout(Duration::from_secs(1));

                println!(
                    "{:<8} {:>5.1}%  {:>7}  {:>6.0}/s  {:>6.1}ms  {:>6.1}ms  {:>6.1}ms  {:>6.1}ms",
                    format!("{:.1}s", duration),
                    hist.len() as f64 / self.count as f64 * 100.0,
                    hist.len(),
                    hist.len() as f64 / duration,
                    Duration::from_nanos(hist.value_at_quantile(0.5)).as_secs_f64() * 1000.0,
                    Duration::from_nanos(hist.value_at_quantile(0.9)).as_secs_f64() * 1000.0,
                    Duration::from_nanos(hist.value_at_quantile(0.99)).as_secs_f64() * 1000.0,
                    Duration::from_nanos(hist.max()).as_secs_f64() * 1000.0,
                );
            }

            // All workers have exited by now (done_rx is disconnected), so
            // joining won't block. Report the first worker error, if any,
            // rather than silently discarding it and claiming success. The
            // generator is not joined here: it may still be blocked on a send,
            // and only unblocks once work_rx is dropped when this closure
            // returns.
            for worker in workers {
                match worker.join() {
                    Ok(result) => result?,
                    Err(panic) => std::panic::resume_unwind(panic),
                }
            }
            Ok(())
        })?;

        // Verify the final dataset.
        println!();
        print!("Verifying dataset... ");
        std::io::stdout().flush()?;
        let start = Instant::now();
        workload.verify(&mut client, self.count)?;
        println!("done ({:.3}s)", start.elapsed().as_secs_f64());

        Ok(())
    }
}

/// A workload, run by Runner. The runner first prepares the dataset, then
/// generates work items and executes them concurrently across workers, and
/// finally verifies the dataset. Workloads implement Display to describe
/// themselves and their parameters when run.
trait Workload: std::fmt::Display {
    /// A work item. Items are generated in a single thread and sent to worker
    /// threads for execution, so they must be Send.
    type Item: Send;

    /// Prepares the workload by creating initial tables and data. The RNG is
    /// the runner's seeded RNG.
    fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()>;

    /// Generates work items as an iterator. The runner only takes as many
    /// items as the number of transactions it was asked to execute.
    fn generate(&self, rng: StdRng) -> Result<impl Iterator<Item = Self::Item> + Send + 'static>;

    /// Executes a single work item. This will automatically be retried on
    /// certain errors, and must use a transaction where appropriate.
    fn execute(client: &mut Client, item: &Self::Item) -> Result<()>;

    /// Verifies the dataset after the workload has completed. Receives the
    /// number of transactions the runner was asked to execute. Does nothing
    /// by default.
    fn verify(&self, _client: &mut Client, _txns: usize) -> Result<()> {
        Ok(())
    }
}

/// A read-only workload. Creates an id,value table and populates it with the
/// given row count and value size. Then runs batches of random primary key
/// lookups (SELECT * FROM read WHERE id = 1 OR id = 2 ...).
// NOTE: the `///` comments on the fields below double as clap help text.
#[derive(clap::Args, Clone)]
#[command(about = "A read-only workload using primary key lookups")]
struct Read {
    /// Total number of rows in data set.
    // Rows are given the primary keys 1 through `rows`.
    #[arg(short, long, default_value = "1000")]
    rows: u64,

    /// Row value size (excluding primary key).
    // The number of random alphanumeric characters in each value.
    #[arg(short, long, default_value = "64")]
    size: usize,

    /// Number of rows to fetch in a single select.
    // Each batch consists of this many unique, random primary keys.
    #[arg(short, long, default_value = "1")]
    batch: usize,
}

impl std::fmt::Display for Read {
    /// Formats the workload name along with its parameters.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { rows, size, batch } = self;
        write!(f, "read (rows={} size={} batch={})", rows, size, batch)
    }
}

impl Workload for Read {
    type Item = HashSet<u64>;

    fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()> {
        client.execute("BEGIN")?;
        client.execute(r#"DROP TABLE IF EXISTS "read""#)?;
        client.execute(r#"CREATE TABLE "read" (id INT PRIMARY KEY, value STRING NOT NULL)"#)?;

        let chars = &mut rand::distr::Alphanumeric.sample_iter(rng).map(|b| b as char);
        let rows = (1..=self.rows).map(|id| (id, chars.take(self.size).collect::<String>()));
        let chunks = rows.chunks(100);
        let queries = chunks.into_iter().map(|chunk| {
            format!(
                r#"INSERT INTO "read" (id, value) VALUES ({})"#,
                chunk.map(|(id, value)| format!("{}, '{}'", id, value)).join("), (")
            )
        });
        for query in queries {
            client.execute(&query)?;
        }
        client.execute("COMMIT")?;
        Ok(())
    }

    fn generate(&self, rng: StdRng) -> Result<impl Iterator<Item = Self::Item> + 'static> {
        Ok(ReadGenerator {
            batch: self.batch,
            dist: rand::distr::Uniform::new(1, self.rows + 1)?,
            rng,
        })
    }

    fn execute(client: &mut Client, item: &Self::Item) -> Result<()> {
        let batch_size = item.len();
        let query = format!(
            r#"SELECT * FROM "read" WHERE {}"#,
            item.iter().map(|id| format!("id = {}", id)).join(" OR ")
        );
        let rows: Rows = client.execute(&query)?.try_into()?;
        assert_eq!(rows.count(), batch_size, "Unexpected row count");
        Ok(())
    }

    fn verify(&self, client: &mut Client, _: usize) -> Result<()> {
        let count: i64 = client.execute(r#"SELECT COUNT(*) FROM "read""#)?.try_into()?;
        assert_eq!(count, self.rows as i64, "Unexpected row count");
        Ok(())
    }
}

/// A Read workload generator, yielding batches of random, unique primary keys.
/// The generator never ends.
struct ReadGenerator {
    /// The number of unique primary keys in each batch.
    batch: usize,
    /// The random number generator used to sample keys.
    rng: StdRng,
    /// The distribution that primary keys are sampled from.
    dist: rand::distr::Uniform<u64>,
}

impl Iterator for ReadGenerator {
    type Item = <Read as Workload>::Item;

    /// Samples primary keys until the batch holds enough unique ones. At least
    /// one key is always sampled.
    fn next(&mut self) -> Option<Self::Item> {
        let mut ids = HashSet::new();
        loop {
            // Duplicate keys are absorbed by the set, so keep sampling until
            // the batch is full.
            ids.insert(self.dist.sample(&mut self.rng));
            if ids.len() >= self.batch {
                return Some(ids);
            }
        }
    }
}

/// A write-only workload. Creates an id,value table, and writes rows with
/// sequential primary keys and the given value size, in the given batch size
/// (INSERT INTO write (id, value) VALUES ...). The number of rows written
/// is given by Runner.count * Write.batch.
// NOTE: the `///` comments on the fields below double as clap help text.
#[derive(clap::Args, Clone)]
#[command(about = "A write-only workload writing sequential rows")]
struct Write {
    /// Row value size (excluding primary key).
    // The number of random alphanumeric characters in each value.
    #[arg(short, long, default_value = "64")]
    size: usize,

    /// Number of rows to write in a single insert query.
    #[arg(short, long, default_value = "1")]
    batch: usize,
}

impl std::fmt::Display for Write {
    /// Formats the workload name along with its parameters.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { size, batch } = self;
        write!(f, "write (size={} batch={})", size, batch)
    }
}

impl Workload for Write {
    /// A batch of (id, value) rows to insert.
    type Item = Vec<(u64, String)>;

    /// Creates an empty "write" table, dropping any existing one, in a single
    /// transaction.
    fn prepare(&self, client: &mut Client, _: &mut StdRng) -> Result<()> {
        let statements = [
            "BEGIN",
            r#"DROP TABLE IF EXISTS "write""#,
            r#"CREATE TABLE "write" (id INT PRIMARY KEY, value STRING NOT NULL)"#,
            "COMMIT",
        ];
        for statement in statements {
            client.execute(statement)?;
        }
        Ok(())
    }

    /// Returns a generator of row batches, with primary keys starting at 1.
    fn generate(&self, rng: StdRng) -> Result<impl Iterator<Item = Self::Item> + 'static> {
        let generator = WriteGenerator { next_id: 1, size: self.size, batch: self.batch, rng };
        Ok(generator)
    }

    /// Inserts the batch of rows with a single INSERT statement, and asserts
    /// that the server reports having inserted all of them. Panics on any
    /// other kind of statement result.
    fn execute(client: &mut Client, item: &Self::Item) -> Result<()> {
        let values = item.iter().map(|(id, value)| format!("({}, '{}')", id, value)).join(", ");
        let query = format!(r#"INSERT INTO "write" (id, value) VALUES {}"#, values);
        let StatementResult::Insert { count } = client.execute(&query)? else {
            panic!("Unexpected result")
        };
        assert_eq!(count as usize, item.len(), "Unexpected row count");
        Ok(())
    }

    /// Asserts that the table contains one row per written row, i.e. the
    /// number of transactions times the batch size.
    fn verify(&self, client: &mut Client, txns: usize) -> Result<()> {
        let result = client.execute(r#"SELECT COUNT(*) FROM "write""#)?;
        let count: i64 = result.try_into()?;
        assert_eq!(count as usize, txns * self.batch, "Unexpected row count");
        Ok(())
    }
}

/// A Write workload generator, yielding batches of rows with sequential
/// primary keys and random values. The generator never ends.
struct WriteGenerator {
    /// The primary key of the next generated row.
    next_id: u64,
    /// The number of characters in each row value.
    size: usize,
    /// The number of rows in each batch.
    batch: usize,
    /// The random number generator used for row values.
    rng: StdRng,
}

impl Iterator for WriteGenerator {
    type Item = <Write as Workload>::Item;

    /// Generates the next batch of rows, each with the next sequential primary
    /// key and a random alphanumeric value.
    fn next(&mut self) -> Option<Self::Item> {
        let mut chars = rand::distr::Alphanumeric.sample_iter(&mut self.rng).map(char::from);
        let mut rows = Vec::with_capacity(self.batch);
        for _ in 0..self.batch {
            let value = chars.by_ref().take(self.size).collect();
            rows.push((self.next_id, value));
            self.next_id += 1;
        }
        Some(rows)
    }
}

/// A bank workload. Creates a set of customers and accounts, and makes random
/// transfers between them. Specifically, it picks two random customers A and B,
/// and then finds A's highest-balance account and B's lowest-balance account,
/// and transfers a random amount without overdrawing the account. This
/// somewhat convoluted scheme is used to make the workload slightly less
/// trivial, including joins, ordering, and secondary indexes.
// NOTE: the `///` comments on the fields below double as clap help text.
#[derive(clap::Args, Clone)]
#[command(about = "A bank workload, making transfers between customer accounts")]
struct Bank {
    /// Number of customers.
    // Customers are given the ids 1 through `customers`. Transfers are made
    // between two different customers.
    #[arg(short, long, default_value = "100")]
    customers: u64,

    /// Number of accounts per customer.
    #[arg(short, long, default_value = "10")]
    accounts: u64,

    /// Initial account balance.
    #[arg(short, long, default_value = "100")]
    balance: u64,

    /// Max amount to transfer.
    // Transfer amounts are picked in the range 1 through `max_transfer`, and
    // then capped at the source account's balance.
    #[arg(short, long, default_value = "50")]
    max_transfer: u64,
}

impl std::fmt::Display for Bank {
    /// Formats the workload name along with the number of customers and
    /// accounts per customer.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (customers, accounts) = (self.customers, self.accounts);
        write!(f, "bank (customers={} accounts={})", customers, accounts)
    }
}

impl Workload for Bank {
    type Item = (u64, u64, u64); // from,to,amount

    fn prepare(&self, client: &mut Client, rng: &mut StdRng) -> Result<()> {
        let petnames = petname::Petnames::default();
        client.execute("BEGIN")?;
        client.execute("DROP TABLE IF EXISTS account")?;
        client.execute("DROP TABLE IF EXISTS customer")?;
        client.execute(
            "CREATE TABLE customer (
                    id INTEGER PRIMARY KEY,
                    name STRING NOT NULL
                )",
        )?;
        client.execute(
            "CREATE TABLE account (
                    id INTEGER PRIMARY KEY,
                    customer_id INTEGER NOT NULL INDEX REFERENCES customer,
                    balance INTEGER NOT NULL
                )",
        )?;
        client.execute(&format!(
            "INSERT INTO customer VALUES {}",
            (1..=self.customers)
                .map(|id| {
                    let name = [
                        *petnames.adverbs.choose(rng).expect("no adverb"),
                        *petnames.adjectives.choose(rng).expect("no adjective"),
                        *petnames.nouns.choose(rng).expect("no noun"),
                    ]
                    .join(" ");
                    (id, name)
                })
                .map(|(id, name)| format!("({}, '{}')", id, name))
                .join(", ")
        ))?;
        client.execute(&format!(
            "INSERT INTO account VALUES {}",
            (1..=self.customers)
                .flat_map(|c| (1..=self.accounts).map(move |a| (c, (c - 1) * self.accounts + a)))
                .map(|(c, a)| format!("({}, {}, {})", a, c, self.balance))
                .join(", ")
        ))?;
        client.execute("COMMIT")?;
        Ok(())
    }

    fn generate(&self, rng: StdRng) -> Result<impl Iterator<Item = Self::Item> + 'static> {
        let customers = self.customers;
        let max_transfer = self.max_transfer;
        // Generate random u64s, then pick random from,to,amount as the
        // remainder of the max customer and amount.
        Ok(rand::distr::Uniform::new_inclusive(0, u64::MAX)?
            .sample_iter(rng)
            .tuples()
            .map(move |(a, b, c)| (a % customers + 1, b % customers + 1, c % max_transfer + 1))
            .filter(|(from, to, _)| from != to))
    }

    fn execute(client: &mut Client, item: &Self::Item) -> Result<()> {
        let &(from, to, mut amount) = item;

        client.execute("BEGIN")?;

        let row: Row = client
            .execute(&format!(
                "SELECT a.id, a.balance
                        FROM account a JOIN customer c ON a.customer_id = c.id
                        WHERE c.id = {}
                        ORDER BY a.balance DESC
                        LIMIT 1",
                from
            ))?
            .try_into()?;
        let mut row = row.into_iter();
        let from_account: i64 = row.next().unwrap().try_into()?;
        let from_balance: i64 = row.next().unwrap().try_into()?;
        amount = min(amount, from_balance as u64);

        let to_account: i64 = client
            .execute(&format!(
                "SELECT a.id, a.balance
                        FROM account a JOIN customer c ON a.customer_id = c.id
                        WHERE c.id = {}
                        ORDER BY a.balance ASC
                        LIMIT 1",
                to
            ))?
            .try_into()?;

        client.execute(&format!(
            "UPDATE account SET balance = balance - {} WHERE id = {}",
            amount, from_account,
        ))?;
        client.execute(&format!(
            "UPDATE account SET balance = balance + {} WHERE id = {}",
            amount, to_account,
        ))?;

        client.execute("COMMIT")?;

        Ok(())
    }

    fn verify(&self, client: &mut Client, _: usize) -> Result<()> {
        let balance: i64 = client.execute("SELECT SUM(balance) FROM account")?.try_into()?;
        assert_eq!(balance as u64, self.customers * self.accounts * self.balance);
        let negative: i64 =
            client.execute("SELECT COUNT(*) FROM account WHERE balance < 0")?.try_into()?;
        assert_eq!(negative, 0);
        Ok(())
    }
}


================================================
FILE: src/client.rs
================================================
use std::io::{BufReader, BufWriter, Write as _};
use std::net::{TcpStream, ToSocketAddrs};
use std::time::Duration;

use rand::RngExt as _;

use crate::encoding::Value as _;
use crate::errdata;
use crate::error::{Error, Result};
use crate::server::{Request, Response, Status};
use crate::sql::execution::StatementResult;
use crate::sql::types::Table;
use crate::storage::mvcc;

/// A toyDB client. Connects to a server via TCP and submits SQL statements and
/// other requests. Each request is sent and then its response awaited before
/// returning, and the client keeps track of the current transaction state.
pub struct Client {
    /// Inbound response stream. A buffered reader over a clone of the same
    /// TCP socket as the writer.
    reader: BufReader<TcpStream>,
    /// Outbound request stream. A buffered writer over the TCP socket, flushed
    /// after every request.
    writer: BufWriter<TcpStream>,
    /// The current transaction, if any. Set when a statement begins a
    /// transaction, and cleared when one commits or rolls back.
    txn: Option<mvcc::TransactionState>,
}

impl Client {
    /// Connects to a toyDB server at the given address, creating a new client
    /// with no open transaction. The TCP socket is cloned such that reads and
    /// writes can be buffered separately.
    pub fn connect(addr: impl ToSocketAddrs) -> Result<Self> {
        let write_socket = TcpStream::connect(addr)?;
        let read_socket = write_socket.try_clone()?;
        Ok(Self {
            reader: BufReader::new(read_socket),
            writer: BufWriter::new(write_socket),
            txn: None,
        })
    }

    /// Sends a request to the server, returning the response. Blocks until the
    /// response has been received.
    ///
    /// Errors if the request can't be sent, if the response can't be decoded,
    /// or if the server responded with an error.
    fn request(&mut self, request: Request) -> Result<Response> {
        request.encode_into(&mut self.writer)?;
        // The writer is buffered, so flush it to actually send the request.
        self.writer.flush()?;
        // The server responds with an encoded Result<Response>. The ? handles
        // decoding errors, leaving the server's own result as the return value.
        Result::decode_from(&mut self.reader)?
    }

    /// Executes a SQL statement, returning its result. Also keeps the client's
    /// transaction state in sync: it is set when a transaction begins, and
    /// cleared when one is committed or rolled back.
    pub fn execute(&mut self, statement: &str) -> Result<StatementResult> {
        let response = self.request(Request::Execute(statement.to_owned()))?;
        let result = match response {
            Response::Execute(result) => result,
            response => return errdata!("unexpected response {response:?}"),
        };
        // Update the transaction state.
        match &result {
            StatementResult::Begin(state) => self.txn = Some(state.clone()),
            StatementResult::Commit { .. } | StatementResult::Rollback { .. } => self.txn = None,
            _ => {}
        }
        Ok(result)
    }

    /// Fetches the schema of the named table from the server. Errors if the
    /// server returns an unexpected kind of response.
    pub fn get_table(&mut self, table: &str) -> Result<Table> {
        let response = self.request(Request::GetTable(table.to_owned()))?;
        match response {
            Response::GetTable(schema) => Ok(schema),
            response => errdata!("unexpected response: {response:?}"),
        }
    }

    /// Lists the names of the database tables. Errors if the server returns an
    /// unexpected kind of response.
    pub fn list_tables(&mut self) -> Result<Vec<String>> {
        let response = self.request(Request::ListTables)?;
        match response {
            Response::ListTables(names) => Ok(names),
            response => errdata!("unexpected response: {response:?}"),
        }
    }

    /// Fetches the server status. Errors if the server returns an unexpected
    /// kind of response.
    pub fn status(&mut self) -> Result<Status> {
        let response = self.request(Request::Status)?;
        match response {
            Response::Status(status) => Ok(status),
            response => errdata!("unexpected response: {response:?}"),
        }
    }

    /// Returns the state of the current transaction, or None if no transaction
    /// is open. This is the client's local view, as tracked by execute().
    pub fn txn(&self) -> Option<&mvcc::TransactionState> {
        self.txn.as_ref()
    }

    /// Runs the given closure, automatically retrying serialization and abort
    /// errors. If a transaction is open following an error, it is automatically
    /// rolled back. It is the caller's responsibility to use a transaction in
    /// the closure where appropriate (i.e. when it is not idempotent).
    pub fn with_retry<T>(&mut self, f: impl Fn(&mut Client) -> Result<T>) -> Result<T> {
        const MAX_RETRIES: u32 = 10;
        const MIN_WAIT: u64 = 10;
        const MAX_WAIT: u64 = 2_000;
        let mut retries: u32 = 0;
        loop {
            match f(self) {
                Ok(result) => return Ok(result),
                Err(Error::Serialization | Error::Abort) if retries < MAX_RETRIES => {
                    if self.txn().is_some() {
                        self.execute("ROLLBACK")?;
                    }
                    // Use exponential backoff starting at MIN_WAIT doubling up
                    // to MAX_WAIT, but randomize the wait time in this interval
                    // to reduce the chance of collisions.
                    let mut wait = MAX_WAIT.min(MIN_WAIT * 2_u64.pow(retries));
                    wait = rand::rng().random_range(MIN_WAIT..=wait);
                    std::thread::sleep(Duration::from_millis(wait));
                    retries += 1;
                }
                Err(error) => {
                    if self.txn().is_some() {
                        self.execute("ROLLBACK").ok(); // ignore rollback error
                    }
                    return Err(error);
                }
            }
        }
    }
}


================================================
FILE: src/encoding/bincode.rs
================================================
//! Bincode is used to encode values, both in key/value stores and the toyDB
//! network protocol. It is a Rust-specific encoding that depends on the
//! internal data structures being stable, but it's sufficient for toyDB. See:
//! <https://github.com/bincode-org/bincode>
//!
//! This module wraps the [`bincode`] crate and uses the standard config.

use std::io::{Read, Write};

use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};

use crate::error::{Error, Result};

/// The Bincode configuration passed to every encode/decode call in this
/// module: the crate's standard configuration, unmodified.
const CONFIG: bincode::config::Configuration = bincode::config::standard();

/// Encodes `value` with Bincode into a newly allocated byte vector.
///
/// # Panics
///
/// Panics if the value cannot be encoded, since that is a problem with the
/// data structure rather than with runtime input.
pub fn serialize<T: Serialize>(value: &T) -> Vec<u8> {
    let encoded = bincode::serde::encode_to_vec(value, CONFIG);
    encoded.expect("value must be serializable")
}

/// Decodes a value from `bytes` with Bincode. The decoded value may borrow
/// from `bytes`.
///
/// Only the decoded value is returned; the second element of the decoder's
/// result (the number of bytes read, per the bincode API) is discarded.
pub fn deserialize<'de, T: Deserialize<'de>>(bytes: &'de [u8]) -> Result<T> {
    let (value, _consumed) = bincode::serde::borrow_decode_from_slice(bytes, CONFIG)?;
    Ok(value)
}

/// Encodes `value` with Bincode and writes it to `writer`. Encoding and write
/// failures are returned as errors.
pub fn serialize_into<W: Write, T: Serialize>(mut writer: W, value: &T) -> Result<()> {
    // The encoder's return value is not needed, only success/failure.
    let _ = bincode::serde::encode_into_std_write(value, &mut writer, CONFIG)?;
    Ok(())
}

/// Reads and decodes one Bincode value from `reader`. Any decode or I/O
/// failure is returned as an error.
pub fn deserialize_from<R: Read, T: DeserializeOwned>(mut reader: R) -> Result<T> {
    let value = bincode::serde::decode_from_std_read(&mut reader, CONFIG)?;
    Ok(value)
}

/// Deserializes a value from a reader using Bincode, or returns None if the
/// reader is closed.
pub fn maybe_deserialize_from<R: Read, T: DeserializeOwned>(mut reader: R) -> Result<Option<T>> {
    match bincode::serde::decode_from_std_read(&mut reader, CONFIG) {
        Ok(t) => Ok(Some(t)),
        Err(bincode::error::DecodeError::Io { inner, .. })
            if inner.kind() == std::io::ErrorKind::UnexpectedEof
                || inner.kind() == std::io::ErrorKind::ConnectionReset =>
        {
            Ok(None)
        }
        Err(err) => Err(Error::from(err)),
    }
}


================================================
FILE: src/encoding/format.rs
================================================
//! Formats raw keys and values, recursively where necessary. Handles Raft,
//! MVCC, SQL, and raw binary data.

use std::collections::BTreeSet;
use std::marker::PhantomData;

use itertools::Itertools as _;
use regex::Regex;

use super::{Key as _, Value as _, bincode};
use crate::raft;
use crate::sql;
use crate::storage::mvcc;

/// Renders encoded keys and values as human-readable strings.
///
/// Implementors supply [`Formatter::key`] and [`Formatter::value`]; the
/// pair-formatting helpers are provided in terms of those two.
pub trait Formatter {
    /// Renders an encoded key.
    fn key(key: &[u8]) -> String;

    /// Renders an encoded value. The key is passed along so the implementation
    /// can determine what kind of value it is looking at.
    fn value(key: &[u8], value: &[u8]) -> String;

    /// Renders a key/value pair as `key → value`.
    fn key_value(key: &[u8], value: &[u8]) -> String {
        Self::key_maybe_value(key, Some(value))
    }

    /// Renders a key/value pair as `key → value`, printing `None` in place of
    /// a missing value.
    fn key_maybe_value(key: &[u8], value: Option<&[u8]>) -> String {
        let rendered_key = Self::key(key);
        let rendered_value = match value {
            Some(bytes) => Self::value(key, bytes),
            None => "None".to_string(),
        };
        format!("{rendered_key} → {rendered_value}")
    }
}

/// Formats raw byte slices without any decoding.
pub struct Raw;

impl Raw {
    /// Formats raw bytes as a double-quoted, escaped ASCII string.
    ///
    /// Printable ASCII is emitted as-is; `\t`, `\r`, `\n`, `\'`, `\"` and `\\`
    /// use their backslash escapes, and every other byte is written as
    /// `\xNN`. This is the standard library's ASCII escaping, so the output is
    /// always valid ASCII and needs no intermediate buffer or UTF-8 pass.
    pub fn bytes(bytes: &[u8]) -> String {
        format!("\"{}\"", bytes.escape_ascii())
    }
}

impl Formatter for Raw {
    fn key(key: &[u8]) -> String {
        Self::bytes(key)
    }

    fn value(_key: &[u8], value: &[u8]) -> String {
        Self::bytes(value)
    }
}

/// Formats Raft keys, values, and log entries. The command carried by a log
/// entry is rendered by the inner formatter `F`.
pub struct Raft<F: Formatter>(PhantomData<F>);

impl<F: Formatter> Raft<F> {
    /// Renders a Raft entry as `index@term command`, where the command is
    /// formatted by `F::value` with an empty key, or `None` if the entry has
    /// no command.
    pub fn entry(entry: &raft::Entry) -> String {
        let command = match entry.command.as_deref() {
            Some(bytes) => F::value(&[], bytes),
            None => "None".to_string(),
        };
        format!("{index}@{term} {command}", index = entry.index, term = entry.term)
    }
}

impl<F: Formatter> Formatter for Raft<F> {
    /// Renders a Raft key as `raft:<Debug of key>`, or as raw bytes if the
    /// key cannot be decoded.
    fn key(key: &[u8]) -> String {
        match raft::Key::decode(key) {
            Ok(decoded) => format!("raft:{decoded:?}"),
            Err(_) => Raw::key(key), // invalid key
        }
    }

    /// Renders the value stored under a Raft key, based on the key's kind.
    /// Falls back to raw bytes if either the key or the value fails to decode.
    fn value(key: &[u8], value: &[u8]) -> String {
        let decoded = match raft::Key::decode(key) {
            Ok(decoded) => decoded,
            Err(_) => return Raw::value(key, value), // invalid key
        };
        // Each arm yields None when the value doesn't decode as expected.
        let formatted = match decoded {
            raft::Key::CommitIndex => bincode::deserialize::<(raft::Index, raft::Term)>(value)
                .ok()
                .map(|(index, term)| format!("{index}@{term}")),
            raft::Key::TermVote => {
                bincode::deserialize::<(raft::Term, Option<raft::NodeID>)>(value).ok().map(
                    |(term, vote)| {
                        let vote = vote.map_or_else(|| "None".to_string(), |id| id.to_string());
                        format!("term={term} vote={vote}")
                    },
                )
            }
            raft::Key::Entry(_) => {
                bincode::deserialize::<raft::Entry>(value).ok().map(|entry| Self::entry(&entry))
            }
        };
        formatted.unwrap_or_else(|| Raw::bytes(value))
    }
}

/// Formats MVCC keys/values. Dispatches to F to format the inner key/value.
pub struct MVCC<F: Formatter>(PhantomData<F>);

impl<F: Formatter> Formatter for MVCC<F> {
    fn key(key: &[u8]) -> String {
        let Ok(key) = mvcc::Key::decode(key) else {
            return Raw::key(key); // invalid key
        };
        match key {
            mvcc::Key::TxnWrite(version, innerkey) => {
                format!("mvcc:TxnWrite({version}, {})", F::key(&innerkey))
            }
            mvcc::Key::Version(innerkey, version) => {
                format!("mvcc:Version({}, {version})", F::key(&innerkey))
            }
            mvcc::Key::Unversioned(innerkey) => {
                format!("mvcc:Unversioned({})", F::key(&innerkey))
            }
            mvcc::Key::NextVersion | mvcc::Key::TxnActive(_) | mvcc::Key::TxnActiveSnapshot(_) => {
                format!("mvcc:{key:?}")
            }
        }
    }

    fn value(key: &[u8], value: &[u8]) -> String {
        let Ok(key) = mvcc::Key::decode(key) else {
            return Raw::bytes(value); // invalid key
        };
        match key {
            mvcc::Key::NextVersion => {
                let Ok(version) = bincode::deserialize::<mvcc::Version>(value) else {
                    return Raw::bytes(value);
                };
                version.to_string()
            }
            mvcc::Key::TxnActiveSnapshot(_) => {
                let Ok(active) = bincode::deserialize::<BTreeSet<u64>>(value) else {
                    return Raw::bytes(value);
                };
                format!("{{{}}}", active.iter().join(","))
            }
            mvcc::Key::TxnActive(_) | mvcc::Key::TxnWrite(_, _) => Raw::bytes(value),
            mvcc::Key::Version(userkey, _) => match bincode::deserialize(value) {
                Ok(Some(value)) => F::value(&userkey, value),
                Ok(None) => "None".to_string(),
                Err(_) => Raw::bytes(value),
            },
            mvcc::Key::Unversioned(userkey) => F::value(&userkey, value),
        }
    }
}

/// Formats SQL keys/values.
pub struct SQL;

impl SQL {
    /// Formats a list of SQL values.
    fn values(values: impl IntoIterator<Item = sql::types::Value>) -> String {
        values.into_iter().join(",")
    }

    /// Formats a table schema.
    fn schema(table: sql::types::Table) -> String {
        // Put it all on a single line.
        let re = Regex::new(r#"\n\s*"#).expect("invalid regex");
        re.replace_all(&table.to_string(), " ").into_owned()
    }
}

impl Formatter for SQL {
    /// Renders a SQL engine key with an `sql:` prefix. The Raft applied index
    /// key is printed verbatim, and undecodable keys are rendered as raw
    /// bytes.
    fn key(key: &[u8]) -> String {
        // Special-case the Raft applied index key.
        if key == sql::engine::Raft::APPLIED_INDEX_KEY {
            return String::from_utf8_lossy(key).into_owned();
        }
        let Ok(key) = sql::engine::Key::decode(key) else {
            return Raw::key(key); // invalid key
        };
        match key {
            sql::engine::Key::Table(name) => format!("sql:Table({name})"),
            sql::engine::Key::Index(table, column, value) => {
                format!("sql:Index({table}.{column}, {value})")
            }
            sql::engine::Key::Row(table, id) => {
                format!("sql:Row({table}, {id})")
            }
        }
    }

    /// Renders the value stored under a SQL engine key, based on the key's
    /// kind. Falls back to raw bytes if either the key or the value fails to
    /// decode.
    fn value(key: &[u8], value: &[u8]) -> String {
        // Special-case the applied_index key.
        if key == sql::engine::Raft::APPLIED_INDEX_KEY
            && let Ok(applied_index) = bincode::deserialize::<raft::Index>(value)
        {
            return applied_index.to_string();
        }

        let Ok(key) = sql::engine::Key::decode(key) else {
            // Invalid key. This is a value, so format it as raw bytes like
            // the other fallbacks do (not via the key formatter).
            return Raw::bytes(value);
        };
        match key {
            sql::engine::Key::Table(_) => {
                let Ok(table) = bincode::deserialize(value) else {
                    return Raw::bytes(value);
                };
                Self::schema(table)
            }
            sql::engine::Key::Row(_, _) => {
                let Ok(row) = bincode::deserialize::<sql::types::Row>(value) else {
                    return Raw::bytes(value);
                };
                Self::values(row)
            }
            sql::engine::Key::Index(_, _, _) => {
                let Ok(index) = bincode::deserialize::<BTreeSet<sql::types::Value>>(value) else {
                    return Raw::bytes(value);
                };
                Self::values(index)
            }
        }
    }
}

/// Formats SQL write commands as found in Raft log entries.
pub struct SQLCommand;

impl Formatter for SQLCommand {
    /// Always panics: commands are wrapped in a Raft log entry and therefore
    /// have no key of their own.
    fn key(_key: &[u8]) -> String {
        panic!("SQL commands don't have a key");
    }

    /// Renders a write command as `[t<version> ]<COMMAND ...>`. The
    /// transaction prefix is only included for commands that carry a
    /// transaction which is not read-only. Undecodable commands are rendered
    /// as raw bytes.
    fn value(_key: &[u8], value: &[u8]) -> String {
        let write = match sql::engine::Write::decode(value) {
            Ok(write) => write,
            Err(_) => return Raw::bytes(value),
        };

        // Every command except BEGIN carries its transaction state.
        let txn = match &write {
            sql::engine::Write::Begin => None,
            sql::engine::Write::Commit(txn)
            | sql::engine::Write::Rollback(txn)
            | sql::engine::Write::Delete { txn, .. }
            | sql::engine::Write::Insert { txn, .. }
            | sql::engine::Write::Update { txn, .. }
            | sql::engine::Write::CreateTable { txn, .. }
            | sql::engine::Write::DropTable { txn, .. } => Some(txn),
        };
        let prefix = match txn {
            Some(txn) if !txn.read_only => format!("t{} ", txn.version),
            _ => String::new(),
        };

        let command = match write {
            sql::engine::Write::Begin => "BEGIN".to_string(),
            sql::engine::Write::Commit(_) => "COMMIT".to_string(),
            sql::engine::Write::Rollback(_) => "ROLLBACK".to_string(),
            sql::engine::Write::Delete { table, ids, .. } => {
                let ids = ids.iter().map(|id| id.to_string()).join(",");
                format!("DELETE {table} {ids}")
            }
            sql::engine::Write::Insert { table, rows, .. } => {
                let rows = rows.into_iter().map(|row| format!("({})", SQL::values(row))).join(" ");
                format!("INSERT {table} {rows}")
            }
            sql::engine::Write::Update { table, rows, .. } => {
                let rows = rows
                    .into_iter()
                    .map(|(id, row)| format!("{id}→({})", SQL::values(row)))
                    .join(" ");
                format!("UPDATE {table} {rows}")
            }
            sql::engine::Write::CreateTable { schema, .. } => SQL::schema(schema),
            sql::engine::Write::DropTable { table, .. } => format!("DROP TABLE {table}"),
        };
        format!("{prefix}{command}")
    }
}


================================================
FILE: src/encoding/keycode.rs
================================================
//! Keycode is a lexicographical order-preserving binary encoding for use with
//! keys in key/value stores. It is designed for simplicity, not efficiency
//! (i.e. it does not use varints or other compression methods).
//!
//! Ordering is important because it allows limited scans across specific parts
//! of the keyspace, e.g. scanning an individual table or using an index range
//! predicate like `WHERE id < 100`. It also avoids sorting in some cases where
//! the keys are already in the desired order, e.g. in the Raft log.
//!
//! The encoding is not self-describing: the caller must provide a concrete type
//! to decode into, and the binary key must conform to its structure.
//!
//! Keycode supports a subset of primitive data types, encoded as follows:
//!
//! * [`bool`]: `0x00` for `false`, `0x01` for `true`.
//! * [`u64`]: big-endian binary representation.
//! * [`i64`]: big-endian binary, sign bit flipped.
//! * [`f64`]: big-endian binary, sign bit flipped, all flipped if negative.
//! * [`Vec<u8>`]: `0x00` escaped as `0x00ff`, terminated with `0x0000`.
//! * [`String`]: like [`Vec<u8>`].
//! * Sequences: concatenation of contained elements, with no other structure.
//! * Enum: the variant's index as [`u8`], then the content sequence.
//! * [`crate::sql::types::Value`]: like any other enum.
//!
//! The canonical key representation is an enum. For example:
//!
//! ```
//! #[derive(Debug, Deserialize, Serialize)]
//! enum Key {
//!     Foo,
//!     Bar(String),
//!     Baz(bool, u64, #[serde(with = "serde_bytes")] Vec<u8>),
//! }
//! ```
//!
//! Unfortunately, byte strings such as `Vec<u8>` must be wrapped with
//! [`serde_bytes::ByteBuf`] or use the `#[serde(with="serde_bytes")]`
//! attribute. See <https://github.com/serde-rs/bytes>.

use std::ops::Bound;

use itertools::Either;
use serde::de::{
    Deserialize, DeserializeSeed, EnumAccess, IntoDeserializer as _, SeqAccess, VariantAccess,
    Visitor,
};
use serde::ser::{Impossible, Serialize, SerializeSeq, SerializeTuple, SerializeTupleVariant};

use crate::errdata;
use crate::error::{Error, Result};

/// Encodes a key into its binary Keycode representation.
///
/// Each call allocates a new output vector. Encoding into a caller-provided,
/// reusable buffer would save allocations in the common case where the key is
/// only borrowed for a single storage engine call, but this is kept simple.
///
/// # Panics
///
/// Panics if the key cannot be encoded, since that is a problem with the data
/// structure rather than with runtime input.
pub fn serialize<T: Serialize>(key: &T) -> Vec<u8> {
    let mut encoder = Serializer { output: Vec::new() };
    let outcome = key.serialize(&mut encoder);
    outcome.expect("key must be serializable");
    encoder.output
}

/// Decodes a key from its binary Keycode representation. The entire input
/// must be consumed: leftover bytes after the decoded key are an error.
pub fn deserialize<'a, T: Deserialize<'a>>(input: &'a [u8]) -> Result<T> {
    let mut decoder = Deserializer::from_bytes(input);
    let key = T::deserialize(&mut decoder)?;
    let trailing = decoder.input;
    if trailing.is_empty() {
        Ok(key)
    } else {
        errdata!("unexpected trailing bytes {:x?} at end of key {input:x?}", trailing,)
    }
}

/// Builds the key range covering all keys that start with `prefix`, e.g. for
/// prefix scans.
///
/// The range starts at the prefix itself (inclusive). The exclusive end bound
/// is the prefix truncated after its last non-0xff byte, with that byte
/// incremented by 1: trailing 0xff bytes can't be incremented without
/// overflowing, so they are dropped. If the prefix is empty or consists only
/// of 0xff bytes, no such end key exists and the range is unbounded above.
pub fn prefix_range(prefix: &[u8]) -> (Bound<Vec<u8>>, Bound<Vec<u8>>) {
    let lower = Bound::Included(prefix.to_vec());
    let Some(pivot) = prefix.iter().rposition(|&byte| byte != 0xff) else {
        return (lower, Bound::Unbounded);
    };
    // Keep everything up to and including the pivot byte, then bump it. This
    // can't overflow, since the pivot is not 0xff.
    let mut upper = prefix[..=pivot].to_vec();
    upper[pivot] += 1;
    (lower, Bound::Excluded(upper))
}

/// Serializes keys as binary byte vectors.
///
/// Each supported `serialize_*` method appends its encoding to `output`.
/// Unsupported data types hit `unimplemented!()` and therefore panic.
struct Serializer {
    /// The encoded bytes produced so far.
    output: Vec<u8>,
}

impl serde::ser::Serializer for &mut Serializer {
    type Ok = ();
    type Error = Error;

    // Sequences, tuples, and tuple variants are serialized element by element
    // by the serializer itself. The remaining compound types are unsupported.
    type SerializeSeq = Self;
    type SerializeTuple = Self;
    type SerializeTupleVariant = Self;
    type SerializeTupleStruct = Impossible<(), Error>;
    type SerializeMap = Impossible<(), Error>;
    type SerializeStruct = Impossible<(), Error>;
    type SerializeStructVariant = Impossible<(), Error>;

    /// bool simply uses 1 for true and 0 for false.
    fn serialize_bool(self, v: bool) -> Result<()> {
        self.output.push(if v { 1 } else { 0 });
        Ok(())
    }

    /// Unsupported: panics.
    fn serialize_i8(self, _: i8) -> Result<()> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_i16(self, _: i16) -> Result<()> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_i32(self, _: i32) -> Result<()> {
        unimplemented!()
    }

    /// i64 uses the big-endian two's complement encoding, but flips the
    /// left-most sign bit such that negative numbers are ordered before
    /// positive numbers.
    ///
    /// The relative ordering of the remaining bits is already correct: -1, the
    /// largest negative integer, is encoded as 01111111...11111111, ordered
    /// after all other negative integers but before positive integers.
    fn serialize_i64(self, v: i64) -> Result<()> {
        let mut bytes = v.to_be_bytes();
        bytes[0] ^= 1 << 7; // flip sign bit
        self.output.extend(bytes);
        Ok(())
    }

    /// Unsupported: panics. Note that enum variant indexes are written as a
    /// single byte directly by `serialize_unit_variant`, not via this method.
    fn serialize_u8(self, _: u8) -> Result<()> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_u16(self, _: u16) -> Result<()> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_u32(self, _: u32) -> Result<()> {
        unimplemented!()
    }

    /// u64 simply uses the big-endian encoding.
    fn serialize_u64(self, v: u64) -> Result<()> {
        self.output.extend(v.to_be_bytes());
        Ok(())
    }

    /// Unsupported: panics.
    fn serialize_f32(self, _: f32) -> Result<()> {
        unimplemented!()
    }

    /// f64 is encoded in big-endian IEEE 754 form, but it flips the sign bit to
    /// order positive numbers after negative numbers, and also flips all other
    /// bits for negative numbers to order them from smallest to largest. NaN is
    /// ordered at the end.
    fn serialize_f64(self, v: f64) -> Result<()> {
        let mut bytes = v.to_be_bytes();
        match v.is_sign_negative() {
            false => bytes[0] ^= 1 << 7, // positive, flip sign bit
            true => bytes.iter_mut().for_each(|b| *b = !*b), // negative, flip all bits
        }
        self.output.extend(bytes);
        Ok(())
    }

    /// Unsupported: panics.
    fn serialize_char(self, _: char) -> Result<()> {
        unimplemented!()
    }

    /// Strings are encoded like bytes, using their UTF-8 representation.
    fn serialize_str(self, v: &str) -> Result<()> {
        self.serialize_bytes(v.as_bytes())
    }

    /// Byte slices are terminated by 0x0000, escaping 0x00 as 0x00ff. This
    /// ensures that we can detect the end, and that for two overlapping slices,
    /// the shorter one orders before the longer one.
    ///
    /// We can't use e.g. length prefix encoding, since it doesn't sort correctly.
    fn serialize_bytes(self, v: &[u8]) -> Result<()> {
        let bytes = v
            .iter()
            .flat_map(|&byte| match byte {
                0x00 => Either::Left([0x00, 0xff].into_iter()),
                byte => Either::Right([byte].into_iter()),
            })
            .chain([0x00, 0x00]);
        self.output.extend(bytes);
        Ok(())
    }

    /// Unsupported: panics.
    fn serialize_none(self) -> Result<()> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_some<T: Serialize + ?Sized>(self, _: &T) -> Result<()> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_unit(self) -> Result<()> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_unit_struct(self, _: &'static str) -> Result<()> {
        unimplemented!()
    }

    /// Enum variants are serialized using their index, as a single byte.
    /// Returns an error if the index does not fit in a `u8`.
    fn serialize_unit_variant(self, _: &'static str, index: u32, _: &'static str) -> Result<()> {
        self.output.push(index.try_into()?);
        Ok(())
    }

    /// Unsupported: panics.
    fn serialize_newtype_struct<T: Serialize + ?Sized>(self, _: &'static str, _: &T) -> Result<()> {
        unimplemented!()
    }

    /// Newtype variants are serialized using the variant index and inner type.
    fn serialize_newtype_variant<T: Serialize + ?Sized>(
        self,
        name: &'static str,
        index: u32,
        variant: &'static str,
        value: &T,
    ) -> Result<()> {
        self.serialize_unit_variant(name, index, variant)?;
        value.serialize(self)
    }

    /// Sequences are serialized as the concatenation of the serialized elements.
    /// No length or terminator is written.
    fn serialize_seq(self, _: Option<usize>) -> Result<Self::SerializeSeq> {
        Ok(self)
    }

    /// Tuples are serialized as the concatenation of the serialized elements.
    fn serialize_tuple(self, _: usize) -> Result<Self::SerializeTuple> {
        Ok(self)
    }

    /// Unsupported: panics.
    fn serialize_tuple_struct(
        self,
        _: &'static str,
        _: usize,
    ) -> Result<Self::SerializeTupleStruct> {
        unimplemented!()
    }

    /// Tuple variants are serialized using the variant index and the
    /// concatenation of the serialized elements.
    fn serialize_tuple_variant(
        self,
        name: &'static str,
        index: u32,
        variant: &'static str,
        _: usize,
    ) -> Result<Self::SerializeTupleVariant> {
        self.serialize_unit_variant(name, index, variant)?;
        Ok(self)
    }

    /// Unsupported: panics.
    fn serialize_map(self, _: Option<usize>) -> Result<Self::SerializeMap> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_struct(self, _: &'static str, _: usize) -> Result<Self::SerializeStruct> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn serialize_struct_variant(
        self,
        _: &'static str,
        _: u32,
        _: &'static str,
        _: usize,
    ) -> Result<Self::SerializeStructVariant> {
        unimplemented!()
    }
}

/// Sequence elements are appended to the output one after another, with no
/// length prefix, separator, or terminator.
impl SerializeSeq for &mut Serializer {
    type Ok = ();
    type Error = Error;

    /// Appends the encoding of a single element.
    fn serialize_element<T: Serialize + ?Sized>(&mut self, value: &T) -> Result<()> {
        T::serialize(value, &mut **self)
    }

    /// Nothing to finalize: no terminator is written.
    fn end(self) -> Result<()> {
        Ok(())
    }
}

/// Tuple elements, like sequence elements, are appended to the output one
/// after another with no additional structure.
impl SerializeTuple for &mut Serializer {
    type Ok = ();
    type Error = Error;

    /// Appends the encoding of a single tuple element.
    fn serialize_element<T: Serialize + ?Sized>(&mut self, value: &T) -> Result<()> {
        T::serialize(value, &mut **self)
    }

    /// Nothing to finalize: no terminator is written.
    fn end(self) -> Result<()> {
        Ok(())
    }
}

/// Tuple variant fields are appended to the output one after another, like
/// tuple elements. The variant index has already been written by
/// `serialize_tuple_variant` before any field is serialized.
impl SerializeTupleVariant for &mut Serializer {
    type Ok = ();
    type Error = Error;

    /// Appends the encoding of a single field.
    fn serialize_field<T: Serialize + ?Sized>(&mut self, value: &T) -> Result<()> {
        T::serialize(value, &mut **self)
    }

    /// Nothing to finalize: no terminator is written.
    fn end(self) -> Result<()> {
        Ok(())
    }
}

/// Decodes keys from byte slices into a caller-provided type. Keycode is not
/// self-describing, so the concrete target type drives the decoding.
pub struct Deserializer<'de> {
    /// The bytes that have not been consumed yet.
    input: &'de [u8],
}

impl<'de> Deserializer<'de> {
    /// Creates a deserializer that reads from the given byte slice.
    pub fn from_bytes(input: &'de [u8]) -> Self {
        Self { input }
    }

    /// Splits off and returns the first `len` bytes of the remaining input, or
    /// errors if fewer than `len` bytes are left.
    fn take_bytes(&mut self, len: usize) -> Result<&[u8]> {
        let remaining = self.input;
        if len > remaining.len() {
            return errdata!("insufficient bytes, expected {len} bytes for {:x?}", self.input);
        }
        let (head, tail) = remaining.split_at(len);
        self.input = tail;
        Ok(head)
    }

    /// Decodes the next escaped byte string and consumes it from the input,
    /// including its terminator.
    ///
    /// 0x00ff decodes to a literal 0x00, and 0x0000 terminates the string. Any
    /// other byte following 0x00 (or a lone trailing 0x00) is an invalid
    /// escape sequence, and running out of input before the terminator is an
    /// error.
    fn decode_next_bytes(&mut self) -> Result<Vec<u8>> {
        let mut decoded = Vec::new();
        let mut pos = 0;
        loop {
            match self.input.get(pos) {
                None => return errdata!("unexpected end of input"),
                Some(0x00) => match self.input.get(pos + 1) {
                    // Terminator: consume it and stop.
                    Some(0x00) => {
                        pos += 2;
                        break;
                    }
                    // Escaped 0x00.
                    Some(0xff) => {
                        decoded.push(0x00);
                        pos += 2;
                    }
                    _ => return errdata!("invalid escape sequence"),
                },
                Some(&byte) => {
                    decoded.push(byte);
                    pos += 1;
                }
            }
        }
        self.input = &self.input[pos..];
        Ok(decoded)
    }
}

/// For details on serialization formats, see Serializer.
///
/// Only the data types that the serializer can produce are supported. All
/// other `deserialize_*` methods hit `unimplemented!()` and therefore panic.
impl<'de> serde::de::Deserializer<'de> for &mut Deserializer<'de> {
    type Error = Error;

    /// Always panics: Keycode is not self-describing, so the caller must
    /// request a concrete type.
    fn deserialize_any<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        panic!("must provide type, Keycode is not self-describing")
    }

    /// Decodes a single byte: 0x00 is false, 0x01 is true, anything else is
    /// an error.
    fn deserialize_bool<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        visitor.visit_bool(match self.take_bytes(1)?[0] {
            0x00 => false,
            0x01 => true,
            b => return errdata!("invalid boolean value {b}"),
        })
    }

    /// Unsupported: panics.
    fn deserialize_i8<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_i16<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_i32<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Decodes 8 big-endian bytes, flipping the sign bit back.
    fn deserialize_i64<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        // Copy straight into a stack array, no heap allocation needed.
        let mut bytes: [u8; 8] = self.take_bytes(8)?.try_into()?;
        bytes[0] ^= 1 << 7; // flip sign bit
        visitor.visit_i64(i64::from_be_bytes(bytes))
    }

    /// Unsupported: panics.
    fn deserialize_u8<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_u16<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_u32<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Decodes 8 big-endian bytes.
    fn deserialize_u64<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        visitor.visit_u64(u64::from_be_bytes(self.take_bytes(8)?.try_into()?))
    }

    /// Unsupported: panics.
    fn deserialize_f32<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Decodes 8 big-endian bytes, undoing the bit flips of the encoding: an
    /// encoded leading bit of 0 means the number was negative (all bits were
    /// flipped), and 1 means it was positive (only the sign bit was flipped).
    fn deserialize_f64<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        // Copy straight into a stack array, no heap allocation needed.
        let mut bytes: [u8; 8] = self.take_bytes(8)?.try_into()?;
        if (bytes[0] & (1 << 7)) == 0 {
            bytes.iter_mut().for_each(|b| *b = !*b); // negative, flip all bits
        } else {
            bytes[0] ^= 1 << 7; // positive, flip sign bit
        }
        visitor.visit_f64(f64::from_be_bytes(bytes))
    }

    /// Unsupported: panics.
    fn deserialize_char<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Decodes an escaped byte string and validates it as UTF-8.
    fn deserialize_str<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        let bytes = self.decode_next_bytes()?;
        visitor.visit_str(&String::from_utf8(bytes)?)
    }

    /// Decodes an escaped byte string and validates it as UTF-8, handing the
    /// visitor an owned string.
    fn deserialize_string<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        let bytes = self.decode_next_bytes()?;
        visitor.visit_string(String::from_utf8(bytes)?)
    }

    /// Decodes an escaped byte string.
    fn deserialize_bytes<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        let bytes = self.decode_next_bytes()?;
        visitor.visit_bytes(&bytes)
    }

    /// Decodes an escaped byte string, handing the visitor an owned buffer.
    fn deserialize_byte_buf<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        let bytes = self.decode_next_bytes()?;
        visitor.visit_byte_buf(bytes)
    }

    /// Unsupported: panics.
    fn deserialize_option<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_unit<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_unit_struct<V: Visitor<'de>>(self, _: &'static str, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_newtype_struct<V: Visitor<'de>>(
        self,
        _: &'static str,
        _: V,
    ) -> Result<V::Value> {
        unimplemented!()
    }

    /// Sequences are decoded element by element via the `SeqAccess`
    /// implementation, until the input is exhausted.
    fn deserialize_seq<V: Visitor<'de>>(self, visitor: V) -> Result<V::Value> {
        visitor.visit_seq(self)
    }

    /// Tuples are decoded like sequences; the length hint is ignored.
    fn deserialize_tuple<V: Visitor<'de>>(self, _: usize, visitor: V) -> Result<V::Value> {
        visitor.visit_seq(self)
    }

    /// Unsupported: panics.
    fn deserialize_tuple_struct<V: Visitor<'de>>(
        self,
        _: &'static str,
        _: usize,
        _: V,
    ) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_map<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_struct<V: Visitor<'de>>(
        self,
        _: &'static str,
        _: &'static [&'static str],
        _: V,
    ) -> Result<V::Value> {
        unimplemented!()
    }

    /// Enums are decoded via the `EnumAccess` implementation, which reads the
    /// variant index first.
    fn deserialize_enum<V: Visitor<'de>>(
        self,
        _: &'static str,
        _: &'static [&'static str],
        visitor: V,
    ) -> Result<V::Value> {
        visitor.visit_enum(self)
    }

    /// Unsupported: panics.
    fn deserialize_identifier<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }

    /// Unsupported: panics.
    fn deserialize_ignored_any<V: Visitor<'de>>(self, _: V) -> Result<V::Value> {
        unimplemented!()
    }
}

/// Sequence elements are decoded one at a time until no input bytes remain,
/// since sequences carry no length or terminator.
impl<'de> SeqAccess<'de> for Deserializer<'de> {
    type Error = Error;

    /// Decodes the next element, or returns `None` once the input is empty.
    fn next_element_seed<T: DeserializeSeed<'de>>(&mut self, seed: T) -> Result<Option<T::Value>> {
        if self.input.is_empty() {
            Ok(None)
        } else {
            let element = seed.deserialize(self)?;
            Ok(Some(element))
        }
    }
}

/// Enum variants are identified by their index, stored as a single byte.
impl<'de> EnumAccess<'de> for &mut Deserializer<'de> {
    type Error = Error;
    type Variant = Self;

    /// Reads the one-byte variant index and resolves it to a variant, then
    /// hands back the deserializer itself to decode the variant's contents.
    fn variant_seed<V: DeserializeSeed<'de>>(self, seed: V) -> Result<(V::Value, Self::Variant)> {
        let index = u32::from(self.take_bytes(1)?[0]);
        // The annotation pins the index deserializer's error type to ours.
        let variant: Result<V::Value> = seed.deserialize(index.into_deserializer());
        variant.map(|variant| (variant, self))
    }
}

/// Variant access for the Keycode deserializer. Variant contents are read
/// straight from the remaining input, with tuple variants treated as
/// sequences.
impl<'de> VariantAccess<'de> for &mut Deserializer<'de> {
    type Error = Error;

    /// Unit variants have no contents, so nothing is consumed.
    fn unit_variant(self) -> Result<()> {
        Ok(())
    }

    /// Newtype variants decode their single field directly from the input.
    fn newtype_variant_seed<T: DeserializeSeed<'de>>(self, seed: T) -> Result<T::Value> {
        seed.deserialize(self)
    }

    /// Tuple variants are decoded as a sequence; the declared length is
    /// ignored.
    fn tuple_variant<V: Visitor<'de>>(self, _len: usize, visitor: V) -> Result<V::Value> {
        visitor.visit_seq(self)
    }

    /// Struct variants are not supported: calling this panics via
    /// `unimplemented!()`.
    fn struct_variant<V: Visitor<'de>>(
        self,
        _fields: &'static [&'static str],
        _visitor: V,
    ) -> Result<V::Value> {
        unimplemented!()
    }
}

#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use std::f64::consts::PI;

    use paste::paste;
    use serde::{Deserialize, Serialize};
    use serde_bytes::ByteBuf;

    use super::*;
    use crate::sql::types::Value;

    /// Test key enum covering a unit, a newtype, and a tuple variant, plus a
    /// variant whose byte and string fields are `Cow`s marked `serde(borrow)`.
    #[derive(Debug, Deserialize, Serialize, PartialEq)]
    enum Key<'a> {
        Unit,
        NewType(String),
        Tuple(bool, #[serde(with = "serde_bytes")] Vec<u8>, u64),
        Cow(
            #[serde(with = "serde_bytes")]
            #[serde(borrow)]
            Cow<'a, [u8]>,
            bool,
            #[serde(borrow)] Cow<'a, str>,
        ),
    }

    /// Assert that serializing a value yields the expected byte sequence (as a
    /// hex-encoded string), and that deserializing it yields the original value.
    ///
    /// Each case has the form `name: value => "hex",` and expands to one
    /// `#[test]` function called `name`.
    macro_rules! test_serialize_deserialize {
        ( $( $name:ident: $input:expr => $expect:literal, )* ) => {
        $(
            #[test]
            fn $name() -> Result<()> {
                // Encode and compare against the expected hex string.
                let mut input = $input;
                let expect = $expect;
                let output = serialize(&input);
                assert_eq!(hex::encode(&output), expect, "encode failed");

                // Decode the bytes again and compare against the original value.
                let expect = input;
                input = deserialize(&output)?; // reuse input variable for proper type
                assert_eq!(input, expect, "decode failed");
                Ok(())
            }
        )*
        };
    }

    /// Assert that deserializing invalid inputs results in errors. Takes byte
    /// slices (as hex-encoded strings) and the type to deserialize into.
    ///
    /// Each case has the form `name: "hex" as Type,` and expands to a
    /// `#[should_panic]` test called `name_deserialize_error`. The test unwraps
    /// the result, so a returned error and a panic inside the deserializer
    /// both count as a pass.
    macro_rules! test_deserialize_error {
        ( $( $name:ident: $input:literal as $type:ty, )* ) => {
        paste! {
        $(
            #[test]
            #[should_panic]
            fn [< $name _deserialize_error >]() {
                let bytes = hex::decode($input).unwrap();
                deserialize::<$type>(&bytes).unwrap();
            }
        )*
        }
        };
    }

    /// Assert that serializing a value results in an error.
    ///
    /// Each case has the form `name: value,` and expands to a `#[should_panic]`
    /// test called `name_serialize_error`. The return value of `serialize` is
    /// discarded, so the failure is expected to surface as a panic.
    macro_rules! test_serialize_error {
        ( $( $name:ident: $input:expr, )* ) => {
        paste! {
        $(
            #[test]
            #[should_panic]
            fn [< $name _serialize_error >]() {
                let input = $input;
                serialize(&input);
            }
        )*
        }
        };
    }

    // Round-trip cases: each value must encode to exactly the given hex string
    // and decode back to an equal value.
    test_serialize_deserialize! {
        // Booleans are a single byte.
        bool_false: false => "00",
        bool_true: true => "01",

        // Floats are 8 bytes. The expected encodings compare bytewise in the
        // same order as the values compare numerically (with -0.0 before 0.0).
        f64_min: f64::MIN => "0010000000000000",
        f64_neg_inf: f64::NEG_INFINITY => "000fffffffffffff",
        f64_neg_pi: -PI => "3ff6de04abbbd2e7",
        f64_neg_zero: -0f64 => "7fffffffffffffff",
        f64_zero: 0f64 => "8000000000000000",
        f64_pi: PI => "c00921fb54442d18",
        f64_max: f64::MAX => "ffefffffffffffff",
        f64_inf: f64::INFINITY => "fff0000000000000",
        // We don't test NAN here, since NAN != NAN.

        // Signed integers are 8 bytes: the expected bytes are the big-endian
        // two's complement value with the top bit flipped.
        i64_min: i64::MIN => "0000000000000000",
        i64_neg_65535: -65535i64 => "7fffffffffff0001",
        i64_neg_1: -1i64 => "7fffffffffffffff",
        i64_0: 0i64 => "8000000000000000",
        i64_1: 1i64 => "8000000000000001",
        i64_65535: 65535i64 => "800000000000ffff",
        i64_max: i64::MAX => "ffffffffffffffff",

        // Unsigned integers are 8 big-endian bytes.
        u64_min: u64::MIN => "0000000000000000",
        u64_1: 1_u64 => "0000000000000001",
        u64_65535: 65535_u64 => "000000000000ffff",
        u64_max: u64::MAX => "ffffffffffffffff",

        // Byte strings end with 0x0000, and a 0x00 byte in the data is
        // written as 0x00ff.
        bytes: ByteBuf::from(vec![0x01, 0xff]) => "01ff0000",
        bytes_empty: ByteBuf::new() => "0000",
        bytes_escape: ByteBuf::from(vec![0x00, 0x01, 0x02]) => "00ff01020000",

        // Strings use the same terminator and escaping as byte strings.
        string: "foo".to_string() => "666f6f0000",
        string_empty: "".to_string() => "0000",
        string_escape: "foo\x00bar".to_string() => "666f6f00ff6261720000",
        string_utf8: "👋".to_string() => "f09f918b0000",

        // Tuples, arrays and vectors are their elements back to back, without
        // a length prefix.
        tuple: (true, u64::MAX, ByteBuf::from(vec![0x00, 0x01])) => "01ffffffffffffffff00ff010000",
        array_bool: [false, true, false] => "000100",
        vec_bool: vec![false, true, false] => "000100",
        vec_u64: vec![u64::MIN, u64::MAX, 65535_u64] => "0000000000000000ffffffffffffffff000000000000ffff",

        // Enums are a one-byte variant index followed by the variant's fields.
        enum_unit: Key::Unit => "00",
        enum_newtype: Key::NewType("foo".to_string()) => "01666f6f0000",
        enum_tuple: Key::Tuple(false, vec![0x00, 0x01], u64::MAX) => "020000ff010000ffffffffffffffff",
        enum_cow: Key::Cow(vec![0x00, 0x01].into(), false, String::from("foo").into()) => "0300ff01000000666f6f0000",
        enum_cow_borrow: Key::Cow([0x00, 0x01].as_slice().into(), false, "foo".into()) => "0300ff01000000666f6f0000",

        // SQL values follow the same enum layout.
        value_null: Value::Null => "00",
        value_bool: Value::Boolean(true) => "0101",
        value_int: Value::Integer(-1) => "027fffffffffffffff",
        value_float: Value::Float(PI) => "03c00921fb54442d18",
        value_string: Value::String("foo".to_string()) => "04666f6f0000",
    }

    // Values that must fail to serialize: chars, floats and integers other
    // than f64/i64/u64, options, and a plain Vec<u8> (the passing byte-string
    // cases use ByteBuf or serde_bytes instead).
    test_serialize_error! {
        char: 'a',
        f32: 0f32,
        i8: 0i8,
        i16: 0i16,
        i32: 0i32,
        i128: 0i128,
        u8: 0u8,
        u16: 0u16,
        u32: 0u32,
        u128: 0u128,
        some: Some(true),
        none: Option::<bool>::None,
        vec_u8: vec![0u8],
    }

    // Inputs that must fail to deserialize into the given type: target types
    // that also fail to serialize, plus empty, truncated, out-of-range and
    // invalid UTF-8 inputs for otherwise accepted types.
    test_deserialize_error! {
        bool_empty: "" as bool,
        bool_2: "02" as bool,
        char: "61" as char,
        f32: "00000000" as f32,
        i8: "00" as i8,
        i16: "0000" as i16,
        i32: "00000000" as i32,
        i128: "00000000000000000000000000000000" as i128,
        u16: "0000" as u16,
        u32: "00000000" as u32,
        u64_partial: "0000" as u64,
        u128: "00000000000000000000000000000000" as u128,
        option: "00" as Option::<bool>,
        string_utf8_invalid: "c0" as String,
        tuple_partial: "0001" as (bool, bool, bool),
        vec_u8: "0000" as Vec<u8>,
    }
}


================================================
FILE: src/encoding/mod.rs
================================================
//! Binary data encodings.
//!
//! * keycode: used for keys in the key/value store.
//! * bincode: used for values in the key/value store and network protocols.

pub mod bincode;
pub mod format;
pub mod keycode;

use std::cmp::{Eq, Ord};
use std::collections::{BTreeSet, HashSet};
use std::hash::Hash;
use std::io::{Read, Write};

use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};

use crate::error::Result;

/// Adds automatic Keycode encode/decode methods to key enums. These are used
/// as keys in the key/value store.
///
/// The `'de` lifetime is that of the byte slice passed to [`Key::decode`], and
/// is forwarded to serde's `Deserialize<'de>` bound. Implementors normally
/// need no method bodies, since both methods have defaults.
pub trait Key<'de>: Serialize + Deserialize<'de> {
    /// Decodes a key from a byte slice using Keycode.
    ///
    /// Returns whatever error `keycode::deserialize` reports for the input.
    fn decode(bytes: &'de [u8]) -> Result<Self> {
        keycode::deserialize(bytes)
    }

    /// Encodes a key to a byte vector using Keycode.
    ///
    /// In the common case, the encoded key is borrowed for a storage engine
    /// call and then thrown away. We could avoid a bunch of allocations by
    /// taking a reusable byte vector to encode into and return a reference to
    /// it, but we keep it simple.
    fn encode(&self) -> Vec<u8> {
        keycode::serialize(self)
    }
}

/// Adds automatic Bincode encode/decode methods to value types. These are used
/// for values in key/value storage engines, and also for e.g. network protocol
/// messages and other values.
///
/// All methods have default implementations that delegate to the `bincode`
/// module, so implementing the trait is normally an empty `impl` block.
pub trait Value: Serialize + DeserializeOwned {
    /// Decodes a value from a byte slice using Bincode.
    fn decode(bytes: &[u8]) -> Result<Self> {
        bincode::deserialize(bytes)
    }

    /// Decodes a value from a reader using Bincode.
    fn decode_from<R: Read>(reader: R) -> Result<Self> {
        bincode::deserialize_from(reader)
    }

    /// Decodes a value from a reader using Bincode, or returns None if the
    /// reader is closed.
    fn maybe_decode_from<R: Read>(reader: R) -> Result<Option<Self>> {
        bincode::maybe_deserialize_from(reader)
    }

    /// Encodes a value to a byte vector using Bincode. Unlike
    /// [`Value::encode_into`], this returns the bytes directly rather than a
    /// `Result`.
    fn encode(&self) -> Vec<u8> {
        bincode::serialize(self)
    }

    /// Encodes a value into a writer using Bincode, returning any error from
    /// `bincode::serialize_into`.
    fn encode_into<W: Write>(&self, writer: W) -> Result<()> {
        bincode::serialize_into(writer, self)
    }
}

// Blanket implementations for various types wrapping a value type. Each
// wrapper is itself a `Value` whenever its contents are, so e.g. `Option<V>`
// and `Vec<V>` get the encode/decode methods without further code.
impl<V: Value> Value for Option<V> {}
// `Result` here is the crate's alias, so errors are encoded along with values.
impl<V: Value> Value for Result<V> {}
impl<V: Value> Value for Vec<V> {}
impl<V1: Value, V2: Value> Value for (V1, V2) {}
impl<V: Value + Eq + Hash> Value for HashSet<V> {}
impl<V: Value + Eq + Ord + Hash> Value for BTreeSet<V> {}


================================================
FILE: src/error.rs
================================================
use std::fmt::Display;

use serde::{Deserialize, Serialize};

/// toyDB errors.
///
/// Error details are carried as plain strings rather than as the original
/// error values. The enum derives `Serialize`/`Deserialize`, which is what
/// allows the crate's `Result<T>` to be encoded as an `encoding::Value`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Error {
    /// The operation was aborted and must be retried. This typically happens
    /// with e.g. Raft leader changes. This is used instead of implementing
    /// complex retry logic and replay protection in Raft.
    Abort,
    /// Invalid data, typically decoding errors or unexpected internal values.
    InvalidData(String),
    /// Invalid user input, typically parser or query errors.
    InvalidInput(String),
    /// An IO error.
    IO(String),
    /// A write was attempted in a read-only transaction.
    ReadOnly,
    /// A write transaction conflicted with a different writer and lost. The
    /// transaction must be retried.
    Serialization,
}

// Marker impl: the trait's default methods are used as-is, with `Debug` and
// `Display` supplying the formatting.
impl std::error::Error for Error {}

impl Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Error::Abort => write!(f, "operation aborted"),
            Error::InvalidData(msg) => write!(f, "invalid data: {msg}"),
            Error::InvalidInput(msg) => write!(f, "invalid input: {msg}"),
            Error::IO(msg) => write!(f, "io error: {msg}"),
            Error::ReadOnly => write!(f, "read-only transaction"),
            Error::Serialization => write!(f, "serialization failure, retry transaction"),
        }
    }
}

impl Error {
    /// Returns whether the error is considered deterministic. Raft state
    /// machine application needs to know whether a command failure is
    /// deterministic on the input command -- if it is, the command can be
    /// considered applied and the error returned to the client, but otherwise
    /// the state machine must panic to prevent node divergence.
    pub fn is_deterministic(&self) -> bool {
        match self {
            // Aborts don't happen during application, only leader changes. But
            // we consider them non-deterministic in case an abort should happen
            // unexpectedly below Raft.
            Error::Abort => false,
            // Possible data corruption local to this node.
            Error::InvalidData(_) => false,
            // Input errors are (likely) deterministic. They might not be in
            // case data was corrupted in flight, but we ignore this case.
            Error::InvalidInput(_) => true,
            // IO errors are typically local to the node (e.g. faulty disk).
            Error::IO(_) => false,
            // Write commands in read-only transactions are deterministic.
            Error::ReadOnly => true,
            // Write conflicts are determinstic.
            Error::Serialization => true,
        }
    }
}

/// Constructs an Error::InvalidData for the given format string.
///
/// The arguments are passed straight to `format!`. The expansion ends in
/// `.into()`, so it needs a target type from context: it can yield the error
/// itself or, via the `From<Error>` impl for `Result<T>`, an `Err(..)`.
#[macro_export]
macro_rules! errdata {
    ($($args:tt)*) => { $crate::error::Error::InvalidData(format!($($args)*)).into() };
}

/// Constructs an Error::InvalidInput for the given format string.
///
/// The arguments are passed straight to `format!`. The expansion ends in
/// `.into()`, so it needs a target type from context: it can yield the error
/// itself or, via the `From<Error>` impl for `Result<T>`, an `Err(..)`.
#[macro_export]
macro_rules! errinput {
    ($($args:tt)*) => { $crate::error::Error::InvalidInput(format!($($args)*)).into() };
}

/// A toyDB Result returning Error. This is `std::result::Result` with the
/// error type fixed to [`Error`].
pub type Result<T> = std::result::Result<T, Error>;

/// Wraps an error in `Err`, so that `error.into()` can produce a `Result<T>`
/// directly (as used by the `errdata!` and `errinput!` macros).
impl<T> From<Error> for Result<T> {
    fn from(error: Error) -> Self {
        Err(error)
    }
}

/// Lets `Error` serve as the error type of serde deserializers. Custom
/// deserialization errors become `InvalidData` holding the message text.
impl serde::de::Error for Error {
    fn custom<T: Display>(msg: T) -> Self {
        Error::InvalidData(msg.to_string())
    }
}

/// Lets `Error` serve as the error type of serde serializers. Custom
/// serialization errors become `InvalidData` holding the message text.
impl serde::ser::Error for Error {
    fn custom<T: Display>(msg: T) -> Self {
        Error::InvalidData(msg.to_string())
    }
}

/// Bincode decoding failures are reported as invalid data, keeping only the
/// error's message.
impl From<bincode::error::DecodeError> for Error {
    fn from(err: bincode::error::DecodeError) -> Self {
        Error::InvalidData(err.to_string())
    }
}

/// Bincode encoding failures are reported as invalid data, keeping only the
/// error's message.
impl From<bincode::error::EncodeError> for Error {
    fn from(err: bincode::error::EncodeError) -> Self {
        Error::InvalidData(err.to_string())
    }
}

/// Configuration errors are reported as invalid input.
impl From<config::ConfigError> for Error {
    fn from(err: config::ConfigError) -> Self {
        Error::InvalidInput(err.to_string())
    }
}

/// Channel receive failures are reported as IO errors.
impl From<crossbeam::channel::RecvError> for Error {
    fn from(err: crossbeam::channel::RecvError) -> Self {
        Error::IO(err.to_string())
    }
}

/// Channel send failures are reported as IO errors. Only the error's message
/// is kept, not the error value itself.
impl<T> From<crossbeam::channel::SendError<T>> for Error {
    fn from(err: crossbeam::channel::SendError<T>) -> Self {
        Error::IO(err.to_string())
    }
}

/// Non-blocking channel receive failures are reported as IO errors.
impl From<crossbeam::channel::TryRecvError> for Error {
    fn from(err: crossbeam::channel::TryRecvError) -> Self {
        Error::IO(err.to_string())
    }
}

/// Non-blocking channel send failures are reported as IO errors. Only the
/// error's message is kept, not the error value itself.
impl<T> From<crossbeam::channel::TrySendError<T>> for Error {
    fn from(err: crossbeam::channel::TrySendError<T>) -> Self {
        Error::IO(err.to_string())
    }
}

/// Histogram creation errors are treated as a bug in the calling code: this
/// conversion panics with the error message instead of returning an `Error`.
impl From<hdrhistogram::CreationError> for Error {
    fn from(err: hdrhistogram::CreationError) -> Self {
        panic!("{err}") // faulty code
    }
}

/// Histogram recording errors are reported as invalid input.
impl From<hdrhistogram::RecordError> for Error {
    fn from(err: hdrhistogram::RecordError) -> Self {
        Error::InvalidInput(err.to_string())
    }
}

/// Log level parse failures are reported as invalid input.
impl From<log::ParseLevelError> for Error {
    fn from(err: log::ParseLevelError) -> Self {
        Error::InvalidInput(err.to_string())
    }
}

/// Logger installation errors are treated as a bug in the calling code: this
/// conversion panics with the error message instead of returning an `Error`.
impl From<log::SetLoggerError> for Error {
    fn from(err: log::SetLoggerError) -> Self {
        panic!("{err}") // faulty code
    }
}

/// Errors from `rand`'s uniform distribution are reported as invalid input.
impl From<rand::distr::uniform::Error> for Error {
    fn from(err: rand::distr::uniform::Error) -> Self {
        Error::InvalidInput(err.to_string())
    }
}

/// Regex errors are treated as a bug in the calling code: this conversion
/// panics with the error message instead of returning an `Error`.
impl From<regex::Error> for Error {
    fn from(err: regex::Error) -> Self {
        panic!("{err}") // faulty code
    }
}

/// Readline errors are reported as IO errors.
impl From<rustyline::error::ReadlineError> for Error {
    fn from(err: rustyline::error::ReadlineError) -> Self {
        Error::IO(err.to_string())
    }
}

/// Failed slice-to-array conversions are reported as invalid data.
impl From<std::array::TryFromSliceError> for Error {
    fn from(err: std::array::TryFromSliceError) -> Self {
        Error::InvalidData(err.to_string())
    }
}

/// Standard IO errors are reported as `Error::IO`, keeping only the message
/// (the original error kind is not preserved).
impl From<std::io::Error> for Error {
    fn from(err: std::io::Error) -> Self {
        Error::IO(err.to_string())
    }
}

/// Float parse failures are reported as invalid input.
impl From<std::num::ParseFloatError> for Error {
    fn from(err: std::num::ParseFloatError) -> Self {
        Error::InvalidInput(err.to_string())
    }
}

/// Integer parse failures are reported as invalid input.
impl From<std::num::ParseIntError> for Error {
    fn from(err: std::num::ParseIntError) -> Self {
        Error::InvalidInput(err.to_string())
    }
}

/// Failed integer conversions are reported as invalid data.
impl From<std::num::TryFromIntError> for Error {
    fn from(err: std::num::TryFromIntError) -> Self {
        Error::InvalidData(err.to_string())
    }
}

/// Invalid UTF-8 in byte-to-string conversions is reported as invalid data.
impl From<std::string::FromUtf8Error> for Error {
    fn from(err: std::string::FromUtf8Error) -> Self {
        Error::InvalidData(err.to_string())
    }
}

/// Lock poisoning is treated as fatal: this conversion panics with the error
/// message instead of returning an `Error`.
impl<T> From<std::sync::PoisonError<T>> for Error {
    fn from(err: std::sync::PoisonError<T>) -> Self {
        // This only happens when a different thread panics while holding a
        // mutex. This should be fatal, so we panic here too.
        panic!("{err}")
    }
}


================================================
FILE: src/lib.rs
================================================
#![warn(clippy::all)]
#![allow(clippy::large_enum_variant)]
#![allow(clippy::module_inception)]
#![allow(clippy::type_complexity)]

pub mod client;
pub mod encoding;
pub mod error;
pub mod raft;
pub mod server;
pub mod sql;
pub mod storage;

pub use client::Client;
pub use server::Server;
pub use sql::execution::StatementResult;


================================================
FILE: src/raft/log.rs
================================================
use std::ops::{Bound, RangeBounds};

use serde::{Deserialize, Serialize};

use super::{NodeID, Term};
use crate::encoding::{self, Key as _, Value as _, bincode};
use crate::error::Result;
use crate::storage;

/// A log index (entry position). Starts at 1. 0 indicates no index, e.g. the
/// last or commit index of an empty log.
pub type Index = u64;

/// A log entry containing a state machine command. Entries are stored in the
/// log's storage engine as Bincode-encoded values.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Entry {
    /// The entry index.
    ///
    /// We could omit the index in the encoded value, since it's also stored in
    /// the key, but we keep it simple.
    pub index: Index,
    /// The term in which the entry was added.
    pub term: Term,
    /// The state machine command. None (noop) commands are used during leader
    /// election to commit old entries, see section 5.4.2 in the Raft paper.
    pub command: Option<Vec<u8>>,
}

// Gives Entry the default Bincode encode/decode methods.
impl encoding::Value for Entry {}

/// A log storage key. Log entries and the log's metadata share one storage
/// engine, distinguished by the key variant.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Key {
    /// A log entry, storing the term and command.
    Entry(Index),
    /// Stores the current term and vote (if any).
    TermVote,
    /// Stores the current commit index (if any).
    CommitIndex,
}

// Gives Key the default Keycode encode/decode methods.
impl encoding::Key<'_> for Key {}

/// The Raft log stores a sequence of arbitrary commands (typically writes) that
/// are replicated across nodes and applied sequentially to the local state
/// machine. Each entry contains an index, command, and the term in which the
/// leader proposed it. Commands may be noops (None), which are added when a
/// leader is elected (see section 5.4.2 in the Raft paper). For example:
///
/// Index | Term | Command
/// ------|------|------------------------------------------------------
///   1   |   1  | None
///   2   |   1  | CREATE TABLE table (id INT PRIMARY KEY, value STRING)
///   3   |   1  | INSERT INTO table VALUES (1, 'foo')
///   4   |   2  | None
///   5   |   2  | UPDATE table SET value = 'bar' WHERE id = 1
///   6   |   2  | DELETE FROM table WHERE id = 1
///
/// Note that this is for illustration only, and the actual toyDB Raft commands
/// are not SQL statements but lower-level write operations.
///
/// A key/value store is used to store the log entries on disk, keyed by index,
/// along with a few other metadata keys (e.g. who we voted for in this term).
///
/// In the steady state, the log is append-only: when a client submits a
/// command, the leader appends it to its own log (via [`Log::append`]) and
/// replicates it to followers who append it to their logs (via
/// [`Log::splice`]). When an index has been replicated to a majority of nodes
/// it becomes committed, making the log immutable up to that index and
/// guaranteeing that all nodes will eventually contain it. Nodes keep track of
/// the commit index via [`Log::commit`] and apply committed commands to the
/// state machine.
///
/// However, uncommitted entries can be replaced or removed. A leader may append
/// entries to its log, but then be unable to reach consensus on them (e.g.
/// because it is unable to communicate with a majority of nodes). If a
/// different leader is elected and writes different commands to those same
/// indexes, then the uncommitted entries will be replaced with entries from the
/// new leader once the old leader (or a follower) discovers it.
///
/// The Raft log has the following invariants:
///
/// * Entry indexes are contiguous starting at 1 (no index gaps).
/// * Entry terms never decrease from the previous entry.
/// * Entry terms are at or below the current term.
/// * Appended entries are durable (flushed to disk).
/// * Appended entries use the current term.
/// * Committed entries are never changed or removed (no log truncation).
/// * Committed entries will eventually be replicated to all nodes.
/// * Entries with the same index/term contain the same command.
/// * If two logs contain a matching index/term, all previous entries
///   are identical (see section 5.3 in the Raft paper).
///
/// The term, vote, last index/term and commit index/term fields are in-memory
/// copies of state that is loaded from the engine by [`Log::new`] and updated
/// by the methods that write it.
pub struct Log {
    /// The underlying storage engine. Uses a trait object instead of generics,
    /// to allow runtime selection of the engine and avoid propagating the
    /// generic type parameters throughout Raft.
    pub engine: Box<dyn storage::Engine>,
    /// The current term.
    term: Term,
    /// Our leader vote in the current term, if any.
    vote: Option<NodeID>,
    /// The index of the last stored entry.
    last_index: Index,
    /// The term of the last stored entry.
    last_term: Term,
    /// The index of the last committed entry.
    commit_index: Index,
    /// The term of the last committed entry.
    commit_term: Term,
    /// If true, fsync entries to disk when appended. This is mandated by Raft,
    /// but comes with a hefty performance penalty (especially since we don't
    /// optimize for it by batching entries before fsyncing). Disabling it will
    /// yield much better write performance, but may lose data on crashes, which
    /// in some scenarios can cause log entries to become "uncommitted" and
    /// state machines to diverge.
    fsync: bool,
}

impl Log {
    /// Opens a log on top of the given storage engine, loading the current
    /// term/vote, the last entry's index/term, and the commit index/term from
    /// it. Values that are not stored default to 0 (and no vote). Fsync is
    /// enabled by default.
    pub fn new(mut engine: Box<dyn storage::Engine>) -> Result<Self> {
        // Current term and vote, if they were ever stored.
        let (term, vote) = match engine.get(&Key::TermVote.encode())? {
            Some(bytes) => bincode::deserialize(&bytes)?,
            None => (0, None),
        };

        // Index and term of the final entry in the entry keyspace, if any.
        let entry_keys = (
            Bound::Included(Key::Entry(0).encode()),
            Bound::Included(Key::Entry(u64::MAX).encode()),
        );
        let (last_index, last_term) = match engine.scan_dyn(entry_keys).last().transpose()? {
            Some((_, bytes)) => {
                let entry = Entry::decode(&bytes)?;
                (entry.index, entry.term)
            }
            None => (0, 0),
        };

        // Commit index and term, if they were ever stored.
        let (commit_index, commit_term) = match engine.get(&Key::CommitIndex.encode())? {
            Some(bytes) => bincode::deserialize(&bytes)?,
            None => (0, 0),
        };

        Ok(Self {
            engine,
            term,
            vote,
            last_index,
            last_term,
            commit_index,
            commit_term,
            fsync: true, // fsync by default
        })
    }

    /// Controls whether to fsync writes. Disabling this may violate Raft
    /// guarantees, see comment on fsync attribute.
    ///
    /// This only affects entry writes in `append()` and `splice()`;
    /// `set_term_vote()` always flushes.
    pub fn enable_fsync(&mut self, fsync: bool) {
        self.fsync = fsync
    }

    /// Returns the commit index and term, as cached in memory. Both are 0 if
    /// no commit index has been stored.
    pub fn get_commit_index(&self) -> (Index, Term) {
        (self.commit_index, self.commit_term)
    }

    /// Returns the last log index and term, as cached in memory. Both are 0 if
    /// the log has no entries.
    pub fn get_last_index(&self) -> (Index, Term) {
        (self.last_index, self.last_term)
    }

    /// Returns the current term (0 if none) and vote, as cached in memory.
    pub fn get_term_vote(&self) -> (Term, Option<NodeID>) {
        (self.term, self.vote)
    }

    /// Persists the current term and the vote cast in it (if any). The term
    /// must be non-zero and must not regress, and a vote already cast in the
    /// current term can't be changed; violations panic. append() writes
    /// entries in this term, and splice() rejects entries beyond it.
    pub fn set_term_vote(&mut self, term: Term, vote: Option<NodeID>) -> Result<()> {
        assert!(term > 0, "can't set term 0");
        assert!(term >= self.term, "term regression {} → {}", self.term, term);
        let vote_allowed = term > self.term || self.vote.is_none() || vote == self.vote;
        assert!(vote_allowed, "can't change vote");

        // Only touch storage when the term or vote actually changes.
        if (term, vote) != (self.term, self.vote) {
            let value = bincode::serialize(&(term, vote));
            self.engine.set(&Key::TermVote.encode(), value)?;
            // Flush unconditionally, regardless of Log::fsync. Term changes
            // are rare, so the cost is negligible, while a lost vote could
            // allow double voting, multiple leaders, and split brain.
            self.engine.flush()?;
            (self.term, self.vote) = (term, vote);
        }
        Ok(())
    }

    /// Appends a command as a new entry after the last one, in the current
    /// term, and returns the new entry's index. The write is flushed to disk
    /// when fsync is enabled. A `None` command is a noop, typically appended
    /// after Raft leader changes.
    ///
    /// Panics if no term has been set yet (term 0).
    pub fn append(&mut self, command: Option<Vec<u8>>) -> Result<Index> {
        assert!(self.term > 0, "can't append entry in term 0");
        let (index, term) = (self.last_index + 1, self.term);
        let value = Entry { index, term, command }.encode();
        self.engine.set(&Key::Entry(index).encode(), value)?;
        if self.fsync {
            self.engine.flush()?;
        }
        // Only advance the cached position once the write has succeeded.
        (self.last_index, self.last_term) = (index, term);
        Ok(index)
    }

    /// Marks all entries up to and including `index` as committed and returns
    /// the index. Committing the current commit index again is a noop.
    ///
    /// Panics if no entry exists at `index`, or if `index` is below the
    /// current commit index.
    pub fn commit(&mut self, index: Index) -> Result<Index> {
        let Some(entry) = self.get(index)? else {
            panic!("commit index {index} does not exist");
        };
        if entry.index < self.commit_index {
            panic!("commit index regression {} → {}", self.commit_index, entry.index);
        }
        if entry.index == self.commit_index {
            return Ok(index); // already committed, nothing to write
        }

        let term = entry.term;
        // NB: no fsync is needed for the commit index: the entries themselves
        // are fsynced, and the commit index can be recovered from the quorum.
        self.engine.set(&Key::CommitIndex.encode(), bincode::serialize(&(index, term)))?;
        self.commit_index = index;
        self.commit_term = term;
        Ok(index)
    }

    /// Looks up the entry stored at the given index, returning `None` if there
    /// is no such entry. Storage and decoding errors are returned as-is.
    pub fn get(&mut self, index: Index) -> Result<Option<Entry>> {
        match self.engine.get(&Key::Entry(index).encode())? {
            Some(bytes) => Ok(Some(Entry::decode(&bytes)?)),
            None => Ok(None),
        }
    }

    /// Reports whether the log holds an entry at `index` whose term is `term`.
    pub fn has(&mut self, index: Index, term: Term) -> Result<bool> {
        // Fast paths that avoid a storage read, using the cached last entry
        // position. This is the common case when followers process appends or
        // heartbeats.
        if index == 0 || index > self.last_index {
            return Ok(false);
        }
        if index == self.last_index && term == self.last_term {
            return Ok(true);
        }
        // Otherwise, look the entry up and compare its term.
        Ok(matches!(self.get(index)?, Some(entry) if entry.term == term))
    }

    /// Iterates over the log entries whose indexes fall in the given range.
    /// Unbounded ends are clamped to the entry keyspace, i.e. indexes 0 and
    /// `Index::MAX` inclusive.
    pub fn scan(&mut self, range: impl RangeBounds<Index>) -> Iterator<'_> {
        /// Maps an index bound to an encoded entry key bound, substituting an
        /// inclusive bound at `open` when the index bound is unbounded.
        fn key_bound(bound: Bound<&Index>, open: Index) -> Bound<Vec<u8>> {
            match bound {
                Bound::Excluded(&index) => Bound::Excluded(Key::Entry(index).encode()),
                Bound::Included(&index) => Bound::Included(Key::Entry(index).encode()),
                Bound::Unbounded => Bound::Included(Key::Entry(open).encode()),
            }
        }

        let from = key_bound(range.start_bound(), 0);
        let to = key_bound(range.end_bound(), Index::MAX);
        Iterator::new(self.engine.scan_dyn((from, to)))
    }

    /// Iterates over the entries that are ready to be applied: those after
    /// `applied_index`, up to and including the commit index. Yields nothing
    /// if the applied index is at or beyond the commit index.
    pub fn scan_apply(&mut self, applied_index: Index) -> Iterator<'_> {
        // NB: an applied index beyond the commit index is tolerated rather
        // than asserted against, because the local commit index is not
        // flushed to durable storage -- if lost on restart, it can be
        // recovered from the logs of a quorum.
        if applied_index < self.commit_index {
            self.scan(applied_index + 1..=self.commit_index)
        } else {
            Iterator::new(Box::new(std::iter::empty()))
        }
    }

    /// Splices a set of entries into the log and flushes it to disk. New
    /// indexes will be appended. Overlapping indexes with the same term must be
    /// equal and will be ignored. Overlapping indexes with different terms will
    /// truncate the existing log at the first conflict and then splice the new
    /// entries.
    ///
    /// The entries must have contiguous indexes and equal/increasing terms, and
    /// the first entry must be in the range [1,last_index+1] with a term at or
    /// above the previous (base) entry's term and at or below the current term.
    ///
    /// Returns the last index of the log after the splice. The flush is only
    /// done when fsync is enabled.
    ///
    /// # Panics
    ///
    /// Panics if the entries are malformed, don't connect to the existing log,
    /// contain a different command than an existing entry with the same index
    /// and term, or would change entries at or below the commit index.
    pub fn splice(&mut self, entries: Vec<Entry>) -> Result<Index> {
        let (Some(first), Some(last)) = (entries.first(), entries.last()) else {
            return Ok(self.last_index); // empty input is noop
        };

        // Check that the entries are well-formed.
        assert!(first.index > 0 && first.term > 0, "spliced entry has index or term 0",);
        assert!(
            entries.windows(2).all(|w| w[0].index + 1 == w[1].index),
            "spliced entries are not contiguous"
        );
        assert!(
            entries.windows(2).all(|w| w[0].term <= w[1].term),
            "spliced entries have term regression",
        );

        // Check that the entries connect to the existing log (if any), and that the
        // term doesn't regress.
        assert!(last.term <= self.term, "splice term {} beyond current {}", last.term, self.term);
        // first.index - 1 can't underflow, since first.index > 0 was asserted
        // above. Index 0 never holds an entry, so a splice starting at index 1
        // has no base entry to check.
        match self.get(first.index - 1)? {
            Some(base) if first.term < base.term => {
                panic!("splice term regression {} → {}", base.term, first.term)
            }
            Some(_) => {}
            None if first.index == 1 => {}
            None => panic!("first index {} must touch existing log", first.index),
        }

        // Skip entries that are already in the log. The slice is advanced past
        // each entry that matches the existing one, stopping at the first term
        // conflict or when the existing log runs out.
        let mut entries = entries.as_slice();
        let mut scan = self.scan(first.index..=last.index);
        while let Some(entry) = scan.next().transpose()? {
            // [0] is ok, because the scan covers the same index range as the
            // entries and thus yields at most as many items as there are
            // entries.
            assert!(entry.index == entries[0].index, "index mismatch at {entry:?}");
            if entry.term != entries[0].term {
                break;
            }
            assert!(entry.command == entries[0].command, "command mismatch at {entry:?}");
            entries = &entries[1..];
        }
        // The scan iterator borrows self, so release it before using self again.
        drop(scan);

        // If all entries already exist then we're done. Otherwise, first is now
        // the first entry that must be written.
        let Some(first) = entries.first() else {
            return Ok(self.last_index);
        };

        // Write the entries that weren't already in the log, and remove the
        // tail of the old log if any. We can't write below the commit index,
        // since these entries must be immutable.
        assert!(first.index > self.commit_index, "spliced entries below commit index");

        for entry in entries {
            self.engine.set(&Key::Entry(entry.index).encode(), entry.encode())?;
        }
        // Remove old entries beyond the last spliced one. The range is empty
        // if the old log didn't extend past it.
        for index in last.index + 1..=self.last_index {
            self.engine.delete(&Key::Entry(index).encode())?;
        }
        if self.fsync {
            self.engine.flush()?;
        }

        // Only update the cached position once the writes have succeeded.
        self.last_index = last.index;
        self.last_term = last.term;
        Ok(self.last_index)
    }

    /// Returns log engine status, as reported by the underlying storage
    /// engine.
    pub fn status(&mut self) -> Result<storage::Status> {
        self.engine.status()
    }
}

/// A log entry iterator.
///
/// Wraps a raw storage engine scan and yields each scanned value decoded as an
/// `Entry`. The scanned keys are discarded.
pub struct Iterator<'a> {
    /// The underlying engine scan that supplies raw key/value pairs.
    inner: Box<dyn storage::ScanIterator + 'a>,
}

impl<'a> Iterator<'a> {
    /// Wraps the given engine scan iterator, taking ownership of it.
    fn new(inner: Box<dyn storage::ScanIterator + 'a>) -> Self {
        Self { inner }
    }
}

impl std::iter::Iterator for Iterator<'_> {
    type Item = Result<Entry>;

    /// Advances the underlying scan and decodes the next value as an entry.
    /// Scan errors are passed through as-is, and the key is ignored.
    fn next(&mut self) -> Option<Self::Item> {
        let item = match self.inner.next()? {
            Ok((_, value)) => Entry::decode(&value),
            Err(err) => Err(err),
        };
        Some(item)
    }
}

/// Most Raft tests are Goldenscripts under src/raft/testscripts.
#[cfg(test)]
mod tests {
    use std::error::Error;
    use std::fmt::Write as _;
    use std::result::Result;

    use crossbeam::channel::Receiver;
    use itertools::Itertools as _;
    use regex::Regex;
    use tempfile::TempDir;
    use test_each_file::test_each_path;

    use super::*;
    use crate::encoding::format::{self, Formatter as _};
    use crate::storage::engine::test as testengine;

    // Run goldenscript tests in src/raft/testscripts/log.
    test_each_path! { in "src/raft/testscripts/log" as scripts => test_goldenscript }

    /// Runs a single goldenscript file against a freshly constructed
    /// `TestRunner`, panicking if the script run returns an error.
    fn test_goldenscript(path: &std::path::Path) {
        goldenscript::run(&mut TestRunner::new(), path).expect("goldenscript failed")
    }

    /// Runs Raft log goldenscript tests. For available commands, see run().
    struct TestRunner {
        /// The log under test.
        log: Log,
        /// Receives the operations emitted by the log's storage engine, which
        /// run() prints when a command carries the [ops] tag.
        op_rx: Receiver<testengine::Operation>,
        /// The temporary directory holding the BitCask file. It is never read
        /// (hence the allow), only held for the runner's lifetime --
        /// presumably because TempDir removes the directory when dropped;
        /// confirm against the tempfile crate.
        #[allow(dead_code)]
        tempdir: TempDir,
    }

    impl TestRunner {
        /// Creates a runner whose log is backed by a BitCask engine (stored in
        /// a temporary directory) mirrored with a Memory engine. Engine
        /// operations are emitted to the runner's channel.
        fn new() -> Self {
            let (op_tx, op_rx) = crossbeam::channel::unbounded();
            let tempdir = TempDir::with_prefix("toydb").expect("tempdir failed");

            // Mirror operations across a disk-backed and an in-memory engine.
            let path = tempdir.path().join("bitcask");
            let mirror = testengine::Mirror::new(
                storage::BitCask::new(path).expect("bitcask failed"),
                storage::Memory::new(),
            );

            // Emit the mirror's write events to op_tx, and hand it to the log.
            let engine = testengine::Emit::new(mirror, op_tx);
            Self { log: Log::new(Box::new(engine)).expect("log failed"), op_rx, tempdir }
        }

        /// Parses an `INDEX@TERM` pair of decimal numbers, e.g. `3@2`. Errors
        /// if the string has any other shape or a number fails to parse.
        fn parse_index_term(s: &str) -> Result<(Index, Term), Box<dyn Error>> {
            let pattern = Regex::new(r"^(\d+)@(\d+)$").expect("invalid regex");
            let Some(caps) = pattern.captures(s) else {
                return Err(format!("invalid index/term {s}").into());
            };
            // Both groups are mandatory in the pattern, so a successful match
            // always has them and indexing can't panic.
            Ok((caps[1].parse()?, caps[2].parse()?))
        }

        /// Parses an index range, in Rust range syntax: an optional start,
        /// `..` or `..=`, and an optional end (e.g. `..`, `3..`, `..5`,
        /// `2..=4`). A missing start or end leaves that side unbounded; this
        /// includes `..=` without an end, which is accepted as unbounded.
        ///
        /// Errors if the string as a whole isn't a range (including when
        /// anything trails it), or if a bound fails to parse as an index.
        fn parse_index_range(s: &str) -> Result<impl RangeBounds<Index>, Box<dyn Error>> {
            use std::ops::Bound;
            let mut bound = (Bound::<Index>::Unbounded, Bound::<Index>::Unbounded);
            // Anchor at both ends. Without the trailing $, input such as
            // "1..2x" or "1..2..3" matched on its valid prefix and the rest was
            // silently dropped, hiding typos in test scripts.
            let re = Regex::new(r"^(\d+)?\.\.(=)?(\d+)?$").expect("invalid regex");
            let groups = re.captures(s).ok_or_else(|| format!("invalid range {s}"))?;
            if let Some(start) = groups.get(1) {
                bound.0 = Bound::Included(start.as_str().parse()?);
            }
            if let Some(end) = groups.get(3) {
                let end = end.as_str().parse()?;
                // The optional "=" selects an inclusive upper bound.
                if groups.get(2).is_some() {
                    bound.1 = Bound::Included(end)
                } else {
                    bound.1 = Bound::Excluded(end)
                }
            }
            Ok(bound)
        }
    }

    impl goldenscript::Runner for TestRunner {
        /// Executes a single goldenscript command against the log and returns
        /// its output. Each match arm is preceded by a comment giving the
        /// command's syntax. If the command carries the [ops] tag, the engine
        /// operations pending on op_rx are appended to the output. Unknown
        /// commands and unknown tags are returned as errors.
        fn run(&mut self, command: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
            let mut output = String::new();
            // Tags are removed from this copy as they are handled; anything
            // left at the end is reported as an unknown tag.
            let mut tags = command.tags.clone();

            match command.name.as_str() {
                // append [COMMAND]
                "append" => {
                    let mut args = command.consume_args();
                    let command = args.next_pos().map(|a| a.value.as_bytes().to_vec());
                    args.reject_rest()?;
                    let index = self.log.append(command)?;
                    let entry = self.log.get(index)?.expect("entry not found");
                    let fmtentry = format::Raft::<format::Raw>::entry(&entry);
                    writeln!(output, "append → {fmtentry}")?;
                }

                // commit INDEX
                "commit" => {
                    let mut args = command.consume_args();
                    let index = args.next_pos().ok_or("index not given")?.parse()?;
                    args.reject_rest()?;
                    let index = self.log.commit(index)?;
                    let entry = self.log.get(index)?.expect("entry not found");
                    let fmtentry = format::Raft::<format::Raw>::entry(&entry);
                    writeln!(output, "commit → {fmtentry}")?;
                }

                // dump
                "dump" => {
                    command.consume_args().reject_rest()?;
                    // Scans the raw engine rather than the log, so every key
                    // the engine holds is printed.
                    let range = (std::ops::Bound::Unbounded, std::ops::Bound::Unbounded);
                    let mut scan = self.log.engine.scan_dyn(range);
                    while let Some((key, value)) = scan.next().transpose()? {
                        let fmtkv = format::Raft::<format::Raw>::key_value(&key, &value);
                        let rawkv = format::Raw::key_value(&key, &value);
                        writeln!(output, "{fmtkv} [{rawkv}]")?;
                    }
                }

                // get INDEX...
                "get" => {
                    let mut args = command.consume_args();
                    let indexes: Vec<Index> =
                        args.rest_pos().iter().map(|a| a.parse()).try_collect()?;
                    args.reject_rest()?;
                    for index in indexes {
                        let entry = self.log.get(index)?;
                        let fmtentry = entry
                            .as_ref()
                            .map(format::Raft::<format::Raw>::entry)
                            .unwrap_or("None".to_string());
                        writeln!(output, "{fmtentry}")?;
                    }
                }

                // get_term
                "get_term" => {
                    command.consume_args().reject_rest()?;
                    let (term, vote) = self.log.get_term_vote();
                    let vote = vote.map(|v| v.to_string()).unwrap_or("None".to_string());
                    writeln!(output, "term={term} vote={vote}")?;
                }

                // has INDEX@TERM...
                "has" => {
                    let mut args = command.consume_args();
                    let indexes: Vec<(Index, Term)> = args
                        .rest_pos()
                        .iter()
                        .map(|a| Self::parse_index_term(&a.value))
                        .try_collect()?;
                    args.reject_rest()?;
                    for (index, term) in indexes {
                        let has = self.log.has(index, term)?;
                        writeln!(output, "{has}")?;
                    }
                }

                // reload
                "reload" => {
                    command.consume_args().reject_rest()?;
                    // To get owned access to the inner engine, temporarily
                    // replace it with an empty memory engine.
                    let engine =
                        std::mem::replace(&mut self.log.engine, Box::new(storage::Memory::new()));
                    self.log = Log::new(engine)?;
                }

                // scan [RANGE]
                "scan" => {
                    let mut args = command.consume_args();
                    // Without a range argument, scan everything ("..").
                    let range = Self::parse_index_range(
                        args.next_pos().map_or("..", |a| a.value.as_str()),
                    )?;
                    args.reject_rest()?;
                    let mut scan = self.log.scan(range);
                    while let Some(entry) = scan.next().transpose()? {
                        let fmtentry = format::Raft::<format::Raw>::entry(&entry);
                        writeln!(output, "{fmtentry}")?;
                    }
                }

                // scan_apply APPLIED_INDEX
                "scan_apply" => {
                    let mut args = command.consume_args();
                    let applied_index =
                        args.next_pos().ok_or("applied index not given")?.parse()?;
                    args.reject_rest()?;
                    let mut scan = self.log.scan_apply(applied_index);
                    while let Some(entry) = scan.next().transpose()? {
                        let fmtentry = format::Raft::<format::Raw>::entry(&entry);
                        writeln!(output, "{fmtentry}")?;
                    }
                }

                // set_term TERM [VOTE]
                "set_term" => {
                    let mut args = command.consume_args();
                    let term = args.next_pos().ok_or("term not given")?.parse()?;
                    let vote = args.next_pos().map(|a| a.parse()).transpose()?;
                    args.reject_rest()?;
                    self.log.set_term_vote(term, vote)?;
                }

                // splice [INDEX@TERM=COMMAND...]
                "splice" => {
                    let mut args = command.consume_args();
                    let mut entries = Vec::new();
                    for arg in args.rest_key() {
                        let (index, term) = Self::parse_index_term(arg.key.as_deref().unwrap())?;
                        // An empty value means an entry without a command.
                        let command = match arg.value.as_str() {
                            "" => None,
                            value => Some(value.as_bytes().to_vec()),
                        };
                        entries.push(Entry { index, term, command });
                    }
                    args.reject_rest()?;
                    let index = self.log.splice(entries)?;
                    let entry = self.log.get(index)?.expect("entry not found");
                    let fmtentry = format::Raft::<format::Raw>::entry(&entry);
                    writeln!(output, "splice → {fmtentry}")?;
                }

                // status [engine=BOOL]
                "status" => {
                    let mut args = command.consume_args();
                    let engine = args.lookup_parse("engine")?.unwrap_or(false);
                    args.reject_rest()?;
                    let (term, vote) = self.log.get_term_vote();
                    let (last_index, last_term) = self.log.get_last_index();
                    let (commit_index, commit_term) = self.log.get_commit_index();
                    let vote = vote.map(|id| id.to_string()).unwrap_or("None".to_string());
                    write!(
                        output,
                        "term={term} last={last_index}@{last_term} commit={commit_index}@{commit_term} vote={vote}",
                    )?;
                    if engine {
                        write!(output, " engine={:#?}", self.log.status()?)?;
                    }
                    writeln!(output)?;
                }

                name => return Err(format!("unknown command {name}").into()),
            }

            // If requested, output engine operations.
            if tags.remove("ops") {
                while let Ok(op) = self.op_rx.try_recv() {
                    match op {
                        testengine::Operation::Delete { key } => {
                            let fmtkey = format::Raft::<format::Raw>::key(&key);
                            let rawkey = format::Raw::key(&key);
                            writeln!(output, "engine delete {fmtkey} [{rawkey}]")?
                        }
                        testengine::Operation::Flush => writeln!(output, "engine flush")?,
                        testengine::Operation::Set { key, value } => {
                            let fmtkv = format::Raft::<format::Raw>::key_value(&key, &value);
                            let rawkv = format::Raw::key_value(&key, &value);
                            writeln!(output, "engine set {fmtkv} [{rawkv}]")?
                        }
                    }
                }
            }

            // Any tag that wasn't consumed above is unsupported.
            if let Some(tag) = tags.iter().next() {
                return Err(format!("unknown tag {tag}").into());
            }

            Ok(output)
        }

        /// Discards any engine operations still queued on op_rx and returns no
        /// output. The [ops] output itself is produced in run(); draining here
        /// keeps leftover operations from leaking into the [ops] output of a
        /// subsequent command.
        fn end_command(&mut self, _: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
            // Drain any remaining engine operations.
            while self.op_rx.try_recv().is_ok() {}
            Ok(String::new())
        }
    }
}


================================================
FILE: src/raft/message.rs
================================================
use std::collections::BTreeMap;

use serde::{Deserialize, Serialize};

use super::{Entry, Index, NodeID, Term};
use crate::encoding;
use crate::error::Result;
use crate::storage;

/// A message envelope specifying the sender and receiver.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Envelope {
    /// The sender.
    pub from: NodeID,
    /// The sender's current term. Recipients ignore messages from old terms,
    /// and become followers in the sender's term if it is in the future.
    pub term: Term,
    /// The recipient.
    pub to: NodeID,
    /// The message.
    pub message: Message,
}

// Empty impl: Envelope relies solely on the trait's provided methods.
impl encoding::Value for Envelope {}

/// A message sent between Raft nodes. Messages are sent asynchronously (i.e.
/// they are not request/response) and may be dropped or reordered.
///
/// In practice, they are sent across a TCP connection and crossbeam channel
/// which ensures messages are not dropped or reordered as long as the
/// connection remains intact. A message and its response are sent across
/// separate TCP connections (outbound from the respective sender).
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Message {
    /// Candidates campaign for leadership by soliciting votes from peers.
    /// Votes will only be granted if the candidate's log is at least as
    /// up-to-date as the voter.
    Campaign {
        /// The index of the candidate's last log entry.
        last_index: Index,
        /// The term of the candidate's last log entry.
        last_term: Term,
    },

    /// Followers may vote for a single candidate per term, but only if the
    /// candidate's log is at least as up-to-date as the follower. Candidates
    /// implicitly vote for themselves.
    CampaignResponse {
        /// If true, the follower granted the candidate a vote. A false response
        /// isn't necessary, but is emitted for clarity.
        vote: bool,
    },

    /// Leaders send periodic heartbeats. This serves several purposes:
    ///
    /// * Inform nodes about the leader, and prevent elections.
    /// * Detect lost appends and reads, as a retry mechanism.
    /// * Advance followers' commit indexes, so they can apply entries.
    ///
    /// The Raft paper does not have a distinct heartbeat message, and instead
    /// uses an empty AppendEntries RPC, but we choose to add one for better
    /// separation of concerns.
    Heartbeat {
        /// The index of the leader's last log entry. The term is the leader's
        /// current term, since it appends a noop entry on election win. The
        /// follower compares this to its own log to determine if it's
        /// up-to-date.
        last_index: Index,
        /// The index of the leader's last committed log entry. Followers use
        /// this to advance their commit index and apply entries. It's only safe
        /// to commit this if the local log matches last_index, such that the
        /// follower's log is identical to the leader at the commit index.
        commit_index: Index,
        /// The leader's latest read sequence number in this term.
        read_seq: ReadSequence,
    },

    /// Followers respond to leader heartbeats if they still consider it leader.
    HeartbeatResponse {
        /// If non-zero, the heartbeat's last_index which was matched in the
        /// follower's log. Otherwise, the follower is either divergent or
        /// lagging behind the leader.
        match_index: Index,
        /// The heartbeat's read sequence number.
        read_seq: ReadSequence,
    },

    /// Leaders replicate log entries to followers by appending to their logs
    /// after the given base entry.
    ///
    /// If the base entry matches the follower's log then their logs are
    /// identical up to it (see section 5.3 in the Raft paper), and the entries
    /// can be appended -- possibly replacing conflicting entries. Otherwise,
    /// the append is rejected and the leader must retry an earlier base index
    /// until a common base is found.
    ///
    /// Empty append messages (no entries) are used to probe follower logs for
    /// a common match index in the case of divergent logs, restarted nodes, or
    /// dropped messages. This is typically done by sending probes with a
    /// decrementing base index until a match is found, at which point the
    /// subsequent entries can be sent.
    Append {
        /// The index of the log entry to append after.
        base_index: Index,
        /// The term of the base entry.
        base_term: Term,
        /// Log entries to append. Must start at base_index + 1.
        entries: Vec<Entry>,
    },

    /// Followers accept or reject appends from the leader depending on whether
    /// the base entry matches their log.
    AppendResponse {
        /// If non-zero, the follower appended entries up to this index. The
        /// entire log up to this index is consistent with the leader. If no
        /// entries were sent (a probe), this will be the matching base index.
        match_index: Index,
        /// If non-zero, the follower rejected an append at this base index
        /// because the base index/term did not match its log. If the follower's
        /// log is shorter than the base index, the reject index will be lowered
        /// to the index after its last local index, to avoid probing each
        /// missing index.
        reject_index: Index,
    },

    /// Leaders need to confirm they are still the leader before serving reads,
    /// to guarantee linearizability in case a different leader has been
    /// established elsewhere. Read requests are served once the sequence number
    /// has been confirmed by a quorum.
    Read { seq: ReadSequence },

    /// Followers confirm leadership at the read sequence numbers.
    ReadResponse { seq: ReadSequence },

    /// A client request. This can be submitted to the leader, or to a follower
    /// which will forward it to its leader. If there is no leader, or the
    /// leader or term changes, the request is aborted with an Error::Abort
    /// ClientResponse and the client must retry.
    ClientRequest {
        /// The request ID. Must be globally unique for the request duration.
        id: RequestID,
        /// The request itself.
        request: Request,
    },

    /// A client response.
    ClientResponse {
        /// The ID of the original ClientRequest.
        id: RequestID,
        /// The response, or an error.
        response: Result<Response>,
    },
}

/// A client request ID. Must be globally unique while in flight. It ties a
/// `Message::ClientResponse` back to its original `Message::ClientRequest`.
///
/// For simplicity, a random UUIDv4 is used. We could incorporate the
/// node/process/MAC ID and timestamp for better collision avoidance (e.g. via
/// UUIDv6) but it doesn't matter at this scale.
pub type RequestID = uuid::Uuid;

/// A read sequence number, used to confirm leadership for linearizable reads.
/// The leader assigns an incrementing number to each read, and serves the read
/// once a quorum has confirmed that number.
pub type ReadSequence = u64;

/// A client request, typically passed through to the state machine. It is
/// submitted in a `Message::ClientRequest` together with a request ID.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Request {
    /// A state machine read command, executed via `State::read`. This is not
    /// replicated, and only evaluated on the leader.
    Read(Vec<u8>),
    /// A state machine write command, executed via `State::apply`. This is
    /// replicated across all nodes, and must produce a deterministic result.
    Write(Vec<u8>),
    /// Requests Raft cluster status from the leader.
    Status,
}

// Empty impl: Request relies solely on the trait's provided methods.
impl encoding::Value for Request {}

/// A client response. This will be wrapped in a Result for error handling, and
/// is returned as such in `Message::ClientResponse`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Response {
    /// A state machine read result.
    Read(Vec<u8>),
    /// A state machine write result.
    Write(Vec<u8>),
    /// The current Raft leader status.
    Status(Status),
}

// Empty impl: Response relies solely on the trait's provided methods.
impl encoding::Value for Response {}

/// Raft cluster status. Generated by the leader, and returned to clients as
/// `Response::Status`.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Status {
    /// The current Raft leader, which generated this status.
    pub leader: NodeID,
    /// The current Raft term.
    pub term: Term,
    /// The match indexes of all nodes, indicating replication progress. Uses a
    /// BTreeMap for test determinism.
    pub match_index: BTreeMap<NodeID, Index>,
    /// The current commit index.
    pub commit_index: Index,
    /// The current applied index.
    pub applied_index: Index,
    /// The log storage engine status.
    pub storage: storage::Status,
}


================================================
FILE: src/raft/mod.rs
================================================
//! Implements the Raft distributed consensus protocol.
//!
//! For details, see Diego Ongaro's original writings:
//!
//! * Raft paper: <https://raft.github.io/raft.pdf>
//! * Raft thesis: <https://web.stanford.edu/~ouster/cgi-bin/papers/OngaroPhD.pdf>
//! * Raft website: <https://raft.github.io>
//!
//! Raft is a protocol for a group of computers to agree on some data -- or more
//! simply, to replicate the data. It is broadly equivalent to [Paxos] and
//! [Viewstamped Replication], but more prescriptive and simpler to understand.
//!
//! Raft has three main properties:
//!
//! * Fault tolerance: the system tolerates node failures as long as a majority
//!   of nodes (>50%) remain operational.
//!
//! * Linearizability (aka strong consistency): once a client write has been
//!   accepted, it is visible to all clients -- they never see outdated data.
//!
//! * Durability: a write is never lost as long as a majority of nodes remain.
//!
//! It does this by electing a single leader node which serves client requests
//! and replicates writes to other nodes. Requests are executed once they have
//! been confirmed by a strict majority of nodes (a quorum). If a leader fails,
//! a new leader is elected. Clusters have 3 or more nodes, since a two-node
//! cluster can't tolerate failures (1/2 is not a majority and would lead to
//! split brain).
//!
//! Notably, Raft does not provide horizontal scalability. Client requests are
//! processed by a single leader node which can quickly become a bottleneck, and
//! each node stores a complete copy of the entire dataset. Systems often handle
//! this by sharding the data into multiple Raft clusters and using a
//! distributed transaction protocol across them, but this is out of scope here.
//!
//! toyDB follows the Raft paper fairly closely, but, like most implementations,
//! takes some minor artistic liberties.
//!
//! [Paxos]: https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf
//! [Viewstamped Replication]: https://pmg.csail.mit.edu/papers/vr-revisited.pdf
//!
//! RAFT LOG AND STATE MACHINE
//! ==========================
//!
//! Raft maintains an ordered command log containing arbitrary write commands
//! submitted by clients. It attempts to reach consensus on this log by
//! replicating it to a majority of nodes. If successful, the log is considered
//! committed and immutable up to that point.
//!
//! Once committed, the commands in the log are applied sequentially to a local
//! state machine on each node. Raft itself doesn't care what the state machine
//! and commands are -- in toyDB's case it's a SQL database, but it could be
//! anything. Raft simply passes opaque commands to an opaque state machine.
//!
//! Each log entry contains an index, the leader's term (see next section), and
//! the command. For example, a naïve illustration of a toyDB Raft log might be:
//!
//! Index | Term | Command
//! ------|------|------------------------------------------------------
//!   1   |   1  | CREATE TABLE table (id INT PRIMARY KEY, value STRING)
//!   2   |   1  | INSERT INTO table VALUES (1, 'foo')
//!   3   |   2  | UPDATE table SET value = 'bar' WHERE id = 1
//!   4   |   2  | DELETE FROM table WHERE id = 1
//!
//! The state machine must be deterministic, such that all nodes will reach the
//! same identical state. Raft will apply the same commands in the same order
//! independently on all nodes, but if the commands have non-deterministic
//! behavior such as random number generation or communication with external
//! systems it can lead to state divergence causing different results.
//!
//! In toyDB, the Raft log is managed by `Log` and stored locally in a
//! `storage::Engine`. The state machine interface is the `State` trait. See
//! their documentation for more details.
//!
//! LEADER ELECTION
//! ===============
//!
//! Raft nodes can be in one of three states (or roles): follower, candidate,
//! and leader. toyDB models these as `Node::Follower`, `Node::Candidate`, and
//! `Node::Leader`.
//!
//! * Follower: replicates log entries from a leader. May not know a leader yet.
//! * Candidate: campaigns for leadership in an election.
//! * Leader: processes client requests and replicates writes to followers.
//!
//! Raft fundamentally relies on a single guarantee: there can be at most one
//! _valid_ leader at any point in time (old, since-replaced leaders may think
//! they're still a leader, e.g. during a network partition, but they won't be
//! able to do anything). It enforces this through the leader election protocol.
//!
//! Raft divides time into terms, which are monotonically increasing numbers.
//! Higher terms always take priority over lower terms. There can be at most one
//! leader in a term, and it can't change. Nodes keep track of their last known
//! term and store it on disk (see `Log.set_term_vote()`). Messages between
//! nodes are tagged with the current term (as `Envelope.term`) -- old terms are
//! ignored, and future terms cause the node to become a follower in that term.
//!
//! Nodes start out as leaderless followers. If they receive a message from a
//! leader (in a current or future term), they follow it. Otherwise, they wait
//! out the election timeout (a few seconds), become candidates, and hold a
//! leader election.
//!
//! Candidates increase their term by 1 and send `Message::Campaign` to all
//! nodes, requesting their vote. Nodes respond with `Message::CampaignResponse`
//! saying whether a vote was granted. A node can only grant a single vote in a
//! term (stored to disk via `Log.set_term_vote()`), on a first-come
//! first-served basis, and candidates implicitly vote for themselves.
//!
//! When a candidate receives a majority of votes (>50%), it becomes leader. It
//! sends a `Message::Heartbeat` to all nodes asserting its leadership, and all
//! nodes become followers when they receive it (regardless of who they voted
//! for). Leaders continue to send periodic heartbeats every second or so. The
//! new leader also appends an empty entry to its log in order to safely commit
//! all entries from previous terms (Raft paper section 5.4.2).
//!
//! The new leader must have all committed entries in its log (or the cluster
//! would lose data). To ensure this, there is one additional condition for
//! granting a vote: the candidate's log must be at least as up-to-date as the
//! voter. Because an entry must be replicated to a majority before being
//! committed, this ensures a candidate can only win a majority of votes if its
//! log is up-to-date with all committed entries (Raft paper section 5.4.1).
//!
//! It's possible that no candidate wins an election, for example due to a tie
//! or a majority of nodes being offline. After an election timeout passes,
//! candidates will again bump their term and start a new election, until a
//! leader can be established. To avoid frequent ties, nodes use different,
//! randomized election timeouts (Raft paper section 5.2).
//!
//! Similarly, if a follower doesn't hear from a leader in an election timeout
//! interval, it will become candidate and hold another election. The periodic
//! leader heartbeats prevent this as long as the leader is running and
//! connected. A node that becomes disconnected from the leader will continually
//! hold new elections by itself until the network heals, at which point a new
//! election will be held in its term (disrupting the current leader).
//!
//! REPLICATION AND CONSENSUS
//! =========================
//!
//! When the leader receives a client write request, it appends the command to
//! its local log via `Log.append()`, and sends the log entry to all peers in
//! a `Message::Append`. Followers will attempt to durably append the entry to
//! their local logs and respond with `Message::AppendResponse`.
//!
//! Once a majority have acknowledged the append, the leader commits the entry
//! via `Log.commit()` and applies it to its local state machine, returning the
//! result to the client. It will inform followers about the commit in the next
//! heartbeat as `Message::Heartbeat.commit_index` so they can apply it too, but
//! this is not necessary for correctness (they will commit and apply it if they
//! become leader, otherwise they have no need for applying it).
//!
//! Followers may not be able to append the entry to their log -- they may be
//! unreachable, lag behind the leader, or have divergent logs (see Raft paper
//! section 5.3). The `Append` contains the index and term of the log entry
//! immediately before the replicated entry as `base_index` and `base_term`. An
//! index/term pair uniquely identifies a command, and if two logs have the same
//! index/term pair then the logs are identical up to and including that entry
//! (Raft paper section 5.3). If the base index/term matches the follower's log,
//! it appends the entry (potentially replacing any conflicting entries),
//! otherwise it rejects it.
//!
//! When a follower rejects an append, the leader must try to find a common log
//! entry that exists in both its and the follower's log where it can resume
//! replication. It does this by sending `Message::Append` probes only
//! containing a base index/term but no entries -- it will continue to probe
//! decreasing indexes one by one until the follower responds with a match, then
//! send an `Append` with the missing entries (Raft paper section 5.3). It keeps
//! track of each follower's `match_index` and `next_index` in a `Progress`
//! struct to manage this.
//!
//! In case `Append` messages or responses are lost, leaders also send their
//! `last_index` and term in each `Heartbeat`. If followers don't have that
//! index/term pair in their log, they'll say so in the `HeartbeatResponse` and
//! the leader can begin probing their logs as with append rejections.
//!
//! CLIENT REQUESTS
//! ===============
//!
//! Client requests are submitted as `Message::ClientRequest` to the local Raft
//! node. They are only processed on the leader, but followers will proxy them
//! to the leader (Raft thesis section 6.2). To avoid complications with message
//! replays (Raft thesis section 6.3), requests are not retried internally, and
//! are explicitly aborted with `Error::Abort` on leader/term changes as well as
//! elections.
//!
//! Write requests, `Request::Write`, are appended to the Raft log and
//! replicated. The leader keeps track of the request and its log index in a
//! `Write` struct. Once the command is committed and applied to the local state
//! machine, the leader looks up the write request by its log index and sends
//! the result to the client. Deterministic errors (e.g. foreign key violations)
//! are also returned to the client, but non-deterministic errors (e.g. IO
//! errors) must panic the node to avoid state divergence.
//!
//! Read requests, `Request::Read`, are only executed on the leader and don't
//! need to be replicated via the Raft log. However, to ensure linearizability,
//! the leader has to confirm with a quorum that it's actually still the leader.
//! Otherwise, it's possible that a new leader has been elected elsewhere and
//! executed writes without us knowing about it. It does this by assigning an
//! incrementing sequence number to each read, keeping track of the request in a
//! `Read` struct, and immediately sending a `Read` message with the latest
//! sequence number. Followers respond with the sequence number, and once a
//! quorum have confirmed a sequence number the read is executed and the result
//! returned to the client.
//!
//! IMPLEMENTATION CAVEATS
//! ======================
//!
//! For simplicity, toyDB implements the bare minimum for a functional and
//! correct Raft protocol, and omits several advanced mechanisms that would be
//! needed for a real production system. In particular:
//!
//! * No leases: for linearizability, every read request requires the leader to
//!   confirm with followers that it's still the leader. This could be avoided
//!   with a leader lease for a predefined time interval (Raft paper section 8,
//!   Raft thesis section 6.3).
//!
//! * No cluster membership changes: to add or remove nodes, the entire cluster
//!   must be stopped and restarted with the new configuration, otherwise it
//!   risks multiple leaders (Raft paper section 6).
//!
//! * No snapshots: new or lagging nodes must be caught up by replicating and
//!   replaying the entire log, instead of sending a state machine snapshot
//!   (Raft paper section 7).
//!
//! * No log truncation: because snapshots aren't supported, the entire Raft
//!   log must be retained forever in order to catch up new/lagging nodes,
//!   leading to excessive storage use (Raft paper section 7).
//!
//! * No pre-vote or check-quorum: a node that's partially partitioned (can
//!   reach some but not all nodes) can cause persistent unavailability with
//!   spurious elections or heartbeats. A node rejoining after a partition can
//!   also temporarily disrupt a leader. This requires additional pre-vote and
//!   check-quorum protocol extensions (Raft thesis sections 4.2.3 and 9.6).
//!
//! * No request retries: client requests will not be retried on leader changes
//!   or message loss, and will be aggressively aborted, to avoid problems
//!   related to message replay (Raft thesis section 6.3).
//!
//! * No reject hints: if a follower has a divergent log, the leader will probe
//!   entries one by one until a match is found. The replication protocol could
//!   instead be extended with rejection hints (Raft paper section 5.3).

mod log;
mod message;
mod node;
mod state;

use std::ops::Range;
use std::time::Duration;

pub use log::{Entry, Index, Key, Log};
pub use message::{Envelope, Message, ReadSequence, Request, RequestID, Response, Status};
pub use node::{Node, NodeID, Options, Term, Ticks};
pub use state::State;

/// The interval between Raft ticks, the Raft unit of time. The other timing
/// constants below are expressed as a number of these ticks.
pub const TICK_INTERVAL: Duration = Duration::from_millis(100);

/// The interval between leader heartbeats in ticks (400 ms with the 100 ms
/// `TICK_INTERVAL`).
const HEARTBEAT_INTERVAL: Ticks = 4;

/// The default election timeout range in ticks. To avoid election ties, a node
/// chooses a random value in this interval. The range is half-open, i.e. it
/// spans 10 to 19 ticks inclusive (1.0 to 1.9 seconds with the 100 ms
/// `TICK_INTERVAL`).
const ELECTION_TIMEOUT_RANGE: Range<Ticks> = 10..20;

/// The maximum number of log entries to send in a single append message.
const MAX_APPEND_ENTRIES: usize = 100;


================================================
FILE: src/raft/node.rs
================================================
use std::cmp::{max, min};
use std::collections::{HashMap, HashSet, VecDeque};
use std::ops::Range;

use crossbeam::channel::Sender;
use itertools::Itertools as _;
use log::{debug, info};
use rand::RngExt as _;

use super::log::{Index, Log};
use super::message::{Envelope, Message, ReadSequence, Request, RequestID, Response, Status};
use super::state::State;
use super::{ELECTION_TIMEOUT_RANGE, HEARTBEAT_INTERVAL, MAX_APPEND_ENTRIES};
use crate::errinput;
use crate::error::{Error, Result};

/// A node ID, unique within a cluster. Assigned manually when started.
/// Since it is a `u8`, there can be at most 256 distinct node IDs.
pub type NodeID = u8;

/// A leader term number. Increases monotonically on elections.
pub type Term = u64;

/// A logical clock interval as number of ticks. Since it is a `u8`, an
/// interval (e.g. a heartbeat interval or election timeout) can span at most
/// 255 ticks.
pub type Ticks = u8;

/// Tunable Raft node options. Use `Options::default()` to get the module's
/// default settings.
#[derive(Clone, Debug, PartialEq)]
pub struct Options {
    /// How many ticks the leader waits between heartbeats.
    pub heartbeat_interval: Ticks,
    /// The half-open range, in ticks, from which followers and candidates
    /// draw their randomized election timeouts.
    pub election_timeout_range: Range<Ticks>,
    /// The largest number of log entries carried by a single Append message.
    pub max_append_entries: usize,
}

impl Default for Options {
    /// Returns options populated from the module-level default constants.
    fn default() -> Self {
        let heartbeat_interval = HEARTBEAT_INTERVAL;
        let election_timeout_range = ELECTION_TIMEOUT_RANGE;
        let max_append_entries = MAX_APPEND_ENTRIES;
        Self { heartbeat_interval, election_timeout_range, max_append_entries }
    }
}

/// A Raft node with a dynamic role. This implements the Raft distributed
/// consensus protocol, see the `raft` module documentation for more info.
///
/// The node is driven synchronously by processing inbound messages via `step()`
/// and by advancing time via `tick()`. These methods consume the node and
/// return a new one with a possibly different role. Outbound messages are sent
/// via the given `tx` channel, and must be delivered to peers or clients.
///
/// This enum is the public interface to the node, with a closed set of roles.
/// It wraps the `RawNode<Role>` types, which implement the actual node logic.
/// The enum allows ergonomic use across role transitions since it can represent
/// all roles, e.g.: `node = node.step()?`.
pub enum Node {
    /// A candidate campaigns for leadership.
    Candidate(RawNode<Candidate>),
    /// A follower replicates entries from a leader.
    Follower(RawNode<Follower>),
    /// A leader processes client requests and replicates entries to followers.
    Leader(RawNode<Leader>),
}

impl Node {
    /// Creates a new Raft node. It starts as a leaderless follower, waiting to
    /// hear from a leader or otherwise transitioning to candidate and
    /// campaigning for leadership. In the case of a single-node cluster (no
    /// peers), the node immediately transitions to leader when created.
    pub fn new(
        id: NodeID,
        peers: HashSet<NodeID>,
        log: Log,
        state: Box<dyn State>,
        tx: Sender<Envelope>,
        opts: Options,
    ) -> Result<Self> {
        let node = RawNode::new(id, peers, log, state, tx, opts)?;
        // If this is a single-node cluster, become leader immediately.
        if node.cluster_size() == 1 {
            return Ok(node.into_candidate()?.into_leader()?.into());
        }
        Ok(node.into())
    }

    /// Returns the node's ID.
    pub fn id(&self) -> NodeID {
        match self {
            Self::Candidate(node) => node.id,
            Self::Follower(node) => node.id,
            Self::Leader(node) => node.id,
        }
    }

    /// Returns the node's term.
    pub fn term(&self) -> Term {
        match self {
            Self::Candidate(node) => node.term(),
            Self::Follower(node) => node.term(),
            Self::Leader(node) => node.term(),
        }
    }

    /// Processes an inbound message.
    pub fn step(self, msg: Envelope) -> Result<Self> {
        let peers = match &self {
            Self::Candidate(node) => &node.peers,
            Self::Follower(node) => &node.peers,
            Self::Leader(node) => &node.peers,
        };
        assert_eq!(msg.to, self.id(), "message to other node: {msg:?}");
        assert!(peers.contains(&msg.from) || msg.from == self.id(), "unknown sender: {msg:?}");
        debug!("Stepping {msg:?}");

        match self {
            Self::Candidate(node) => node.step(msg),
            Self::Follower(node) => node.step(msg),
            Self::Leader(node) => node.step(msg),
        }
    }

    /// Advances time by a tick.
    pub fn tick(self) -> Result<Self> {
        match self {
            Self::Candidate(node) => node.tick(),
            Self::Follower(node) => node.tick(),
            Self::Leader(node) => node.tick(),
        }
    }
}

impl From<RawNode<Candidate>> for Node {
    fn from(node: RawNode<Candidate>) -> Self {
        Node::Candidate(node)
    }
}

impl From<RawNode<Follower>> for Node {
    fn from(node: RawNode<Follower>) -> Self {
        Node::Follower(node)
    }
}

impl From<RawNode<Leader>> for Node {
    fn from(node: RawNode<Leader>) -> Self {
        Node::Leader(node)
    }
}

/// Marker trait for a Raft role: leader, follower, or candidate. It has no
/// methods; it only bounds the role type parameter of `RawNode`.
pub trait Role {}

/// A Raft node with role R.
///
/// This implements the typestate pattern, where individual node states (roles)
/// are encoded as `RawNode<Role>`. See <http://cliffle.com/blog/rust-typestate/>.
pub struct RawNode<R: Role> {
    /// The node ID. Must be unique in the cluster.
    id: NodeID,
    /// The IDs of the other nodes in the cluster. Does not change while
    /// running. Can change on restart, but all nodes must have the same set of
    /// nodes, otherwise it can result in multiple leaders (split brain).
    peers: HashSet<NodeID>,
    /// The Raft log, which stores client commands to be executed.
    log: Log,
    /// The Raft state machine, which executes client commands from the log.
    state: Box<dyn State>,
    /// Channel for sending outbound messages to other nodes.
    tx: Sender<Envelope>,
    /// Node options.
    opts: Options,
    /// Role-specific state.
    role: R,
}

impl<R: Role> RawNode<R> {
    /// Helper for role transitions. Carries all role-independent fields over
    /// to a node with the given new role; the old role state is discarded.
    fn into_role<T: Role>(self, role: T) -> RawNode<T> {
        let Self { id, peers, log, state, tx, opts, .. } = self;
        RawNode { id, peers, log, state, tx, opts, role }
    }

    /// Returns the node's current term.
    fn term(&self) -> Term {
        let (term, _) = self.log.get_term_vote();
        term
    }

    /// Returns the cluster size as number of nodes (all peers plus ourself).
    fn cluster_size(&self) -> usize {
        1 + self.peers.len()
    }

    /// Returns the cluster quorum size (strict majority).
    fn quorum_size(&self) -> usize {
        let half = self.cluster_size() / 2;
        half + 1
    }

    /// Returns the quorum value (i.e. median) of the given unsorted vector,
    /// which is the largest value that at least a quorum of the entries are
    /// greater than or equal to. The vector must have the same length as the
    /// cluster size.
    fn quorum_value<T: Ord + Copy>(&self, mut values: Vec<T>) -> T {
        assert_eq!(values.len(), self.cluster_size(), "vector size must match cluster size");
        // Order descending, so that the element at position quorum_size - 1
        // has quorum_size - 1 values at or above it.
        let nth = self.quorum_size() - 1;
        let (_, quorum, _) = values.select_nth_unstable_by(nth, |a, b| b.cmp(a));
        *quorum
    }

    /// Generates a random election timeout within the configured range.
    fn random_election_timeout(&self) -> Ticks {
        let range = self.opts.election_timeout_range.clone();
        rand::rng().random_range(range)
    }

    /// Sends a message to the given recipient, stamped with our ID and term.
    fn send(&self, to: NodeID, message: Message) -> Result<()> {
        let envelope = Envelope { from: self.id, to, term: self.term(), message };
        Self::send_via(&self.tx, envelope)
    }

    /// Sends a message via the given channel. This avoids borrowing self, to
    /// allow sending while holding partial borrows of self.
    fn send_via(tx: &Sender<Envelope>, msg: Envelope) -> Result<()> {
        debug!("Sending {msg:?}");
        tx.send(msg)?;
        Ok(())
    }

    /// Broadcasts a message to all peers, stopping at the first send error.
    fn broadcast(&self, message: Message) -> Result<()> {
        // Send in increasing ID order for test determinism.
        self.peers.iter().copied().sorted().try_for_each(|peer| self.send(peer, message.clone()))
    }
}

/// A follower replicates log entries from a leader and forwards client requests
/// to it. Every node starts out as a leaderless follower, and stays one until
/// it either discovers a leader or holds an election.
pub struct Follower {
    /// The leader we follow, or None if we're a leaderless follower.
    leader: Option<NodeID>,
    /// Ticks elapsed since we last received a message from the leader.
    leader_seen: Ticks,
    /// How large leader_seen may grow before we trigger an election.
    election_timeout: Ticks,
    /// Local client requests that have been forwarded to the leader. These are
    /// aborted on leader/term changes.
    forwarded: HashSet<RequestID>,
}

impl Follower {
    /// Creates a new follower role with the given leader and election timeout.
    /// The leader timer starts at zero and no requests are forwarded yet.
    fn new(leader: Option<NodeID>, election_timeout: Ticks) -> Self {
        let leader_seen = 0;
        let forwarded = HashSet::new();
        Self { leader, leader_seen, election_timeout, forwarded }
    }
}

impl Role for Follower {}

impl RawNode<Follower> {
    /// Creates a new node as a leaderless follower with a randomized election
    /// timeout. Errors if the node's own ID is listed among its peers.
    fn new(
        id: NodeID,
        peers: HashSet<NodeID>,
        log: Log,
        state: Box<dyn State>,
        tx: Sender<Envelope>,
        opts: Options,
    ) -> Result<Self> {
        if peers.contains(&id) {
            return errinput!("node ID {id} can't be in peers");
        }

        // The timeout is drawn via the node itself (it needs the options), so
        // start out with a placeholder of 0 and replace it right away.
        let mut node = Self { id, peers, log, state, tx, opts, role: Follower::new(None, 0) };
        let election_timeout = node.random_election_timeout();
        node.role.election_timeout = election_timeout;

        // Catch up the state machine after a restart. State machine writes
        // are not flushed to durable storage, so a tail of them may be lost if
        // the host crashes or restarts. The Raft log is durable though, so the
        // state can always be recovered from it by reapplying any entries that
        // went missing.
        node.maybe_apply()?;
        Ok(node)
    }

    /// Transitions the follower into a candidate, which immediately campaigns
    /// for leadership in a new term.
    fn into_candidate(mut self) -> Result<RawNode<Candidate>> {
        // Forwarded requests can't complete under the old leader anymore.
        // Abort them, clients must retry with the new leader.
        self.abort_forwarded()?;

        // Catch up on pending log entries, so we're up to date if we win.
        self.maybe_apply()?;

        // Switch role and start campaigning.
        let candidate = Candidate::new(self.random_election_timeout());
        let mut node = self.into_role(candidate);
        node.campaign()?;

        // Sanity-check the outcome of the campaign: we're in a real term and
        // have voted for ourself, both in memory and in the log.
        let (term, vote) = node.log.get_term_vote();
        assert!(node.role.votes.contains(&node.id), "candidate did not vote for self");
        assert_ne!(term, 0, "candidate can't have term 0");
        assert_eq!(vote, Some(node.id), "log vote does not match self");

        Ok(node)
    }

    /// Transitions the follower into either a follower of a leader in the
    /// current term (when `leader` is given), or a leaderless follower in a
    /// new term (e.g. if someone holds a new election).
    fn into_follower(mut self, term: Term, leader: Option<NodeID>) -> Result<RawNode<Follower>> {
        assert_ne!(term, 0, "can't become follower in term 0");

        // Abort any forwarded requests. These must be retried with new leader.
        self.abort_forwarded()?;

        match leader {
            // A leader has shown up in the current term. Keep the election
            // timeout we already have.
            Some(leader) => {
                assert!(self.peers.contains(&leader), "leader is not a peer");
                assert_eq!(self.role.leader, None, "already have leader in term");
                assert_eq!(term, self.term(), "can't follow leader in different term");
                info!("Following leader {leader} in term {term}");
                let election_timeout = self.role.election_timeout;
                self.role = Follower::new(Some(leader), election_timeout);
            }
            // A new term was discovered, but its leader is still unknown.
            // We'll find out once we step a message from it.
            None => {
                assert_ne!(term, self.term(), "can't become leaderless follower in current term");
                info!("Discovered new term {term}");
                self.log.set_term_vote(term, None)?;
                let election_timeout = self.random_election_timeout();
                self.role = Follower::new(None, election_timeout);
            }
        }
        Ok(self)
    }

    /// Processes an inbound message.
    ///
    /// Messages from past terms are dropped. A message from a future term
    /// first turns the node into a leaderless follower in that term, which
    /// then steps the message. Messages in the current term are handled by
    /// message type, see the comments on each match arm below.
    ///
    /// # Panics
    ///
    /// Panics if a heartbeat, append, or read in the current term comes from a
    /// node other than the known leader, if a heartbeat has a commit index
    /// beyond its last index, if an append's base index doesn't directly
    /// precede its first entry, if a client request didn't originate from this
    /// node, if a client response doesn't come from the leader, or if the
    /// follower receives a message that only a leader should see
    /// (`HeartbeatResponse`, `AppendResponse`, or `ReadResponse`).
    fn step(mut self, msg: Envelope) -> Result<Node> {
        // Past term: outdated peer, drop the message.
        if msg.term < self.term() {
            debug!("Dropping message from past term: {msg:?}");
            return Ok(self.into());
        }
        // Future term: newer leader or candidate, become leaderless follower
        // and step the message.
        if msg.term > self.term() {
            return self.into_follower(msg.term, None)?.step(msg);
        }

        // Record when we last saw a message from the leader (if any). This
        // resets the timer that tick() compares with the election timeout.
        if Some(msg.from) == self.role.leader {
            self.role.leader_seen = 0
        }

        match msg.message {
            // The leader sends periodic heartbeats. If we don't have a leader
            // yet, follow it. If the commit_index advances, apply commands.
            Message::Heartbeat { last_index, commit_index, read_seq } => {
                assert!(commit_index <= last_index, "commit_index after last_index");

                // Make sure the heartbeat is from our leader, or follow it.
                match self.role.leader {
                    Some(leader) => assert_eq!(msg.from, leader, "multiple leaders in term"),
                    None => self = self.into_follower(msg.term, Some(msg.from))?,
                }

                // Check if our log matches the leader's log up to last_index,
                // and respond to the heartbeat. last_index always has the
                // leader's term, since it only appends entries in its term.
                // A match_index of 0 tells the leader that we don't match.
                let match_index = if self.log.has(last_index, msg.term)? { last_index } else { 0 };
                self.send(msg.from, Message::HeartbeatResponse { match_index, read_seq })?;

                // Advance the commit index and apply entries. We can only do
                // this if we matched the leader's last_index, which implies
                // that the logs are identical up to match_index. This also
                // implies that the commit_index is present in our log.
                if match_index != 0 && commit_index > self.log.get_commit_index().0 {
                    self.log.commit(commit_index)?;
                    self.maybe_apply()?;
                }
            }

            // Append log entries from the leader to the local log.
            Message::Append { base_index, base_term, entries } => {
                // The base entry must immediately precede the first entry.
                if let Some(first) = entries.first() {
                    assert_eq!(base_index, first.index - 1, "base index mismatch");
                }

                // Make sure the append is from our leader, or follow it.
                match self.role.leader {
                    Some(leader) => assert_eq!(msg.from, leader, "multiple leaders in term"),
                    None => self = self.into_follower(msg.term, Some(msg.from))?,
                }

                // If the base entry matches our log, append the entries. A base
                // index of 0 (i.e. before the first entry) always matches. An
                // append without entries is a probe, which matches at the base.
                if base_index == 0 || self.log.has(base_index, base_term)? {
                    let match_index = entries.last().map(|e| e.index).unwrap_or(base_index);
                    self.log.splice(entries)?;
                    self.send(msg.from, Message::AppendResponse { match_index, reject_index: 0 })?;
                } else {
                    // Otherwise, reject the base index. If the local log is
                    // shorter than the base index, lower the reject index to
                    // skip all missing entries.
                    let reject_index = min(base_index, self.log.get_last_index().0 + 1);
                    self.send(msg.from, Message::AppendResponse { reject_index, match_index: 0 })?;
                }
            }

            // Confirm the leader's read sequence number.
            Message::Read { seq } => {
                // Make sure the read is from our leader, or follow it.
                match self.role.leader {
                    Some(leader) => assert_eq!(msg.from, leader, "multiple leaders in term"),
                    None => self = self.into_follower(msg.term, Some(msg.from))?,
                }

                // Confirm the read.
                self.send(msg.from, Message::ReadResponse { seq })?;
            }

            // A candidate is requesting our vote. We only grant one per term.
            Message::Campaign { last_index, last_term } => {
                // Don't vote if we already voted for someone else in this term.
                // We can repeat our vote for the same node though.
                if let (_, Some(vote)) = self.log.get_term_vote()
                    && msg.from != vote
                {
                    self.send(msg.from, Message::CampaignResponse { vote: false })?;
                    return Ok(self.into());
                }

                // Only vote if the candidate's log is at least as long as ours.
                // At least one node in any quorum must have all committed
                // entries, and this ensures we'll only elect a leader that has
                // all committed entries. See section 5.4.1 in the Raft paper.
                let (log_index, log_term) = self.log.get_last_index();
                if log_term > last_term || log_term == last_term && log_index > last_index {
                    self.send(msg.from, Message::CampaignResponse { vote: false })?;
                    return Ok(self.into());
                }

                // Grant the vote. It is recorded in the log before we respond.
                info!("Voting for {} in term {} election", msg.from, msg.term);
                self.log.set_term_vote(msg.term, Some(msg.from))?;
                self.send(msg.from, Message::CampaignResponse { vote: true })?;
            }

            // Forward client requests to the leader, or abort them if there is
            // none. These will not be retried, the client should use timeouts
            // instead.  Local client requests use our node ID as the sender.
            Message::ClientRequest { id, request: _ } => {
                assert_eq!(msg.from, self.id, "client request from other node");

                if let Some(leader) = self.role.leader {
                    debug!("Forwarding request to leader {leader}: {msg:?}");
                    // Track the request, so that it can be aborted on
                    // leader/term changes.
                    self.role.forwarded.insert(id);
                    self.send(leader, msg.message)?
                } else {
                    let response = Err(Error::Abort);
                    self.send(msg.from, Message::ClientResponse { id, response })?
                }
            }

            // Client responses from the leader are passed on to the client.
            Message::ClientResponse { id, response } => {
                assert_eq!(Some(msg.from), self.role.leader, "client response from non-leader");

                // Only pass on responses for requests we're still tracking.
                // Responses for unknown requests are ignored.
                if self.role.forwarded.remove(&id) {
                    self.send(self.id, Message::ClientResponse { id, response })?;
                }
            }

            // We may receive a vote after we lost an election, ignore it.
            Message::CampaignResponse { .. } => {}

            // We're not leader this term, so we shouldn't see these.
            Message::HeartbeatResponse { .. }
            | Message::AppendResponse { .. }
            | Message::ReadResponse { .. } => {
                panic!("follower received unexpected message {msg:?}")
            }
        };
        Ok(self.into())
    }

    /// Processes a logical clock tick. Once the leader has been silent for the
    /// entire election timeout, the follower becomes a candidate and campaigns.
    fn tick(mut self) -> Result<Node> {
        self.role.leader_seen += 1;
        // We heard from the leader recently enough, keep following.
        if self.role.leader_seen < self.role.election_timeout {
            return Ok(self.into());
        }
        // We haven't heard from the leader in a while, campaign.
        let candidate = self.into_candidate()?;
        Ok(candidate.into())
    }

    /// Aborts all forwarded requests (e.g. on term/leader changes).
    fn abort_forwarded(&mut self) -> Result<()> {
        // Sort by ID for test determinism.
        for id in std::mem::take(&mut self.role.forwarded).into_iter().sorted() {
            debug!("Aborting forwarded request {id}");
            self.send(self.id, Message::ClientResponse { id, response: Err(Error::Abort) })?;
        }
        Ok(())
    }

    /// Applies any pending log entries to the state machine, starting after
    /// the state machine's applied index.
    fn maybe_apply(&mut self) -> Result<()> {
        let applied_index = self.state.get_applied_index();
        let mut pending = self.log.scan_apply(applied_index);
        loop {
            match pending.next().transpose()? {
                Some(entry) => {
                    debug!("Applying {entry:?}");
                    // Only the leader responds to clients, so the result is
                    // discarded. That includes errors -- non-deterministic
                    // ones (e.g. IO errors) must panic instead to avoid node
                    // divergence.
                    _ = self.state.apply(entry);
                }
                None => return Ok(()),
            }
        }
    }
}

/// A candidate is campaigning to become a leader.
pub struct Candidate {
    /// The nodes that have voted for us so far (including ourself).
    votes: HashSet<NodeID>,
    /// Ticks elapsed since the election started.
    election_duration: Ticks,
    /// The number of ticks after which the election times out.
    election_timeout: Ticks,
}

impl Candidate {
    /// Creates a new candidate role with the given election timeout. It starts
    /// out with no votes and no elapsed election time.
    fn new(election_timeout: Ticks) -> Self {
        let votes = HashSet::new();
        let election_duration = 0;
        Self { votes, election_duration, election_timeout }
    }
}

impl Role for Candidate {}

impl RawNode<Candidate> {
    /// Transitions the candidate to a follower. Either we lost the election
    /// and follow the winner (when `leader` is given), or we discovered a new
    /// term and step into it as a leaderless follower.
    fn into_follower(mut self, term: Term, leader: Option<NodeID>) -> Result<RawNode<Follower>> {
        let election_timeout = self.random_election_timeout();
        let follower = match leader {
            // We lost the election, follow the winner.
            Some(leader) => {
                assert_eq!(term, self.term(), "can't follow leader in different term");
                info!("Lost election, following leader {leader} in term {term}");
                Follower::new(Some(leader), election_timeout)
            }
            // We found a new term, but we don't necessarily know who the
            // leader is yet. We'll find out when we step a message from it.
            None => {
                assert_ne!(term, self.term(), "can't become leaderless follower in current term");
                info!("Discovered new term {term}");
                self.log.set_term_vote(term, None)?;
                Follower::new(None, election_timeout)
            }
        };
        Ok(self.into_role(follower))
    }

    /// Transitions the candidate to a leader, because we won the election.
    /// Panics if we're in term 0 or haven't voted for ourself.
    fn into_leader(self) -> Result<RawNode<Leader>> {
        let (term, vote) = self.log.get_term_vote();
        assert_ne!(term, 0, "leaders can't have term 0");
        assert_eq!(vote, Some(self.id), "leader did not vote for self");

        info!("Won election for term {term}, becoming leader");
        let last_index = self.log.get_last_index().0;
        let leader = Leader::new(self.peers.clone(), last_index);
        let mut node = self.into_role(leader);

        // On assuming leadership, propose an empty command to disambiguate
        // previous entries in the log. See section 5.4.2 in the Raft paper.
        // This happens before the heartbeat, to avoid a wasted replication
        // roundtrip if the heartbeat response indicates the peer is behind.
        node.propose(None)?;
        node.maybe_commit_and_apply()?;
        node.heartbeat()?;

        Ok(node)
    }

    /// Processes an inbound message.
    fn step(mut self, msg: Envelope) -> Result<Node> {
        // Past term: outdated peer, drop the message.
        if msg.term < self.term() {
            debug!("Dropping message from past term: {msg:?}");
            return Ok(self.into());
        }
        // Future term: newer leader or candidate, become leaderless follower
        // and step the message.
        if msg.term > self.term() {
            return self.into_follower(msg.term, None)?.step(msg);
        }

        match msg.message {
            // If we received a vote, record it. If the vote gives us quorum,
            // assume leadership.
            Message::CampaignResponse { vote: true } => {
                self.role.votes.insert(msg.from);
                if self.role.votes.len() >= self.quorum_size() {
                    return Ok(self.into_leader()?.into());
                }
            }

            // We didn't get the vote. :(
            Message::CampaignResponse { vote: false } => {}

            // Don't grant votes for other candidates.
            Message::Campaign { .. } => {
                self.send(msg.from, Message::CampaignResponse { vote: false })?
            }

            // If we hear from a leader in this term, we lost the election.
            // Follow it and step the message.
            Message::Heartbeat { .. } | Message::Append { .. } | Message::Read { .. } => {
                return self.into_follower(msg.term, Some(msg.from))?.step(msg);
            }

            // Abort client requests while campaigning. The client must retry.
            Message::ClientRequest { id, request: _ } => {
                self.send(msg.from, Message::ClientResponse { id, response: Err(Error::Abort) })?;
            }

            // We're not a leader in this term, nor are we forwarding requests,
            // so we shouldn't see these.
            Message::HeartbeatResponse { .. }
            | Message::AppendResponse { .. }
            | Message::ReadResponse { .. }
            | Message::ClientResponse { .. } => panic!("unexpected message {msg:?}"),
        }
        Ok(self.into())
    }

    /// Processes a logical clock tick. The node remains a candidate; if the
    /// current election has timed out, it starts a new one.
    fn tick(mut self) -> Result<Node> {
        self.role.election_duration += 1;
        let timed_out = self.role.election_duration >= self.role.election_timeout;
        if timed_out {
            // No one won this election in time, so hold a new one.
            self.campaign()?;
        }
        Ok(self.into())
    }

    /// Holds a new election: moves to the next term, votes for ourself, and
    /// solicits votes from all peers. Any state from a previous election in
    /// this role (votes, elapsed time) is reset.
    fn campaign(&mut self) -> Result<()> {
        let term = self.term() + 1;
        info!("Starting new election for term {term}");

        // Start from a fresh candidate role that already holds our own vote.
        let mut role = Candidate::new(self.random_election_timeout());
        role.votes.insert(self.id);
        self.role = role;
        self.log.set_term_vote(term, Some(self.id))?;

        // Peers compare our last log position with theirs to decide the vote.
        let (last_index, last_term) = self.log.get_last_index();
        let campaign = Message::Campaign { last_index, last_term };
        self.broadcast(campaign)
    }
}

/// A leader serves client requests and replicates the log to followers.
/// If the leader loses leadership, all client requests are aborted.
pub struct Leader {
    /// Follower replication progress, keyed by the follower's node ID.
    progress: HashMap<NodeID, Progress>,
    /// Tracks pending write requests by log index. Added when the write is
    /// proposed and appended to the leader's log, and removed when the command
    /// is applied to the state machine, returning the result to the client.
    writes: HashMap<Index, Write>,
    /// Tracks pending read requests. For linearizability, read requests are
    /// assigned a sequence number and only executed once a quorum of nodes have
    /// confirmed that we're still the leader. Otherwise, an old leader could
    /// serve stale reads if a new leader has been elected elsewhere.
    reads: VecDeque<Read>,
    /// The read sequence number used for the last read. Initialized to 0 in
    /// this term, and incremented for every read command.
    read_seq: ReadSequence,
    /// Number of ticks since last heartbeat.
    since_heartbeat: Ticks,
}

/// Per-follower replication progress (in this term). The leader keeps one of
/// these for every peer, and updates it as responses arrive.
struct Progress {
    /// The highest index where the follower's log is known to match the leader.
    /// Initialized to 0, increases monotonically (see advance()).
    match_index: Index,
    /// The next index to replicate to the follower. Initialized to
    /// last_index+1, decreased when probing log mismatches. Always in
    /// the range [match_index+1, last_index+1].
    ///
    /// Entries not yet sent are in the range [next_index, last_index].
    /// Entries not acknowledged are in the range [match_index+1, next_index).
    ///
    /// Bumped optimistically when entries are sent to the follower, and moved
    /// back via regress_next() if the follower turns out not to have them.
    next_index: Index,
    /// The last read sequence number confirmed by this follower. To avoid stale
    /// reads on leader changes, a read is only served once its sequence number
    /// is confirmed by a quorum. Initialized to 0, only moves forward (see
    /// advance_read()).
    read_seq: ReadSequence,
}

impl Progress {
    /// Records that the follower's log matches ours up to and including
    /// match_index. Returns true if this moved match_index forward, or false
    /// if it was stale (at or below the current match_index). When it moves,
    /// next_index is pulled up to at least match_index + 1.
    fn advance(&mut self, match_index: Index) -> bool {
        let advanced = match_index > self.match_index;
        if advanced {
            self.match_index = match_index;
            // Never leave next_index at or below the confirmed match.
            self.next_index = max(match_index + 1, self.next_index);
        }
        advanced
    }

    /// Records that the follower confirmed the given read sequence number.
    /// Returns true if this moved read_seq forward, false if it was stale.
    fn advance_read(&mut self, read_seq: ReadSequence) -> bool {
        let advanced = read_seq > self.read_seq;
        if advanced {
            self.read_seq = read_seq;
        }
        advanced
    }

    /// Moves next_index back towards the given index, returning true if it
    /// changed. It never moves forward, and never goes below match_index + 1
    /// since entries up to match_index are known to be replicated.
    fn regress_next(&mut self, next_index: Index) -> bool {
        // Only ever move backwards.
        if next_index >= self.next_index {
            return false;
        }
        // Already at the lowest allowed position, nothing to regress.
        let floor = self.match_index + 1;
        if self.next_index <= floor {
            return false;
        }
        self.next_index = max(floor, next_index);
        true
    }
}

/// A pending client write request. Tracked by the leader until the write's
/// log entry is applied, or until the leader steps down and aborts it.
struct Write {
    /// The node which submitted the write. The response is sent back to it.
    from: NodeID,
    /// The write request ID, echoed back in the response.
    id: RequestID,
}

/// A pending client read request. Tracked by the leader until the read is
/// executed, or until the leader steps down and aborts it.
struct Read {
    /// The sequence number of this read. The read is executed once a quorum
    /// has confirmed this sequence number.
    seq: ReadSequence,
    /// The node which submitted the read. The response is sent back to it.
    from: NodeID,
    /// The read request ID, echoed back in the response.
    id: RequestID,
    /// The read command, passed to the state machine when the read executes.
    command: Vec<u8>,
}

impl Leader {
    /// Creates a new leader role for the given peers. Every peer starts out
    /// with match_index 0, next_index just past the leader's last_index, and
    /// no confirmed read sequence number. There are no pending requests.
    fn new(peers: HashSet<NodeID>, last_index: Index) -> Self {
        let next_index = last_index + 1;
        let mut progress = HashMap::with_capacity(peers.len());
        for peer in peers {
            progress.insert(peer, Progress { match_index: 0, next_index, read_seq: 0 });
        }
        Self {
            since_heartbeat: 0,
            read_seq: 0,
            reads: VecDeque::new(),
            writes: HashMap::new(),
            progress,
        }
    }
}

// Leader is a node role, i.e. it can be used as the role of a RawNode.
impl Role for Leader {}

impl RawNode<Leader> {
    /// Transitions the leader into a follower. This can only happen if we
    /// discover a new term, so we become a leaderless follower. Stepping the
    /// received message may then follow a new leader, if there is one.
    fn into_follower(mut self, term: Term) -> Result<RawNode<Follower>> {
        assert!(term > self.term(), "leader can only become follower in later term");
        info!("Discovered new term {term}");

        // Abort in-flight requests. The client must retry. Sort the requests
        // by ID for test determinism.
        for write in std::mem::take(&mut self.role.writes).into_values().sorted_by_key(|w| w.id) {
            let response = Err(Error::Abort);
            self.send(write.from, Message::ClientResponse { id: write.id, response })?;
        }
        for read in std::mem::take(&mut self.role.reads).into_iter().sorted_by_key(|r| r.id) {
            let response = Err(Error::Abort);
            self.send(read.from, Message::ClientResponse { id: read.id, response })?;
        }

        self.log.set_term_vote(term, None)?;
        let election_timeout = self.random_election_timeout();
        Ok(self.into_role(Follower::new(None, election_timeout)))
    }

    /// Processes an inbound message, returning the node in its resulting role.
    ///
    /// Messages from past terms are dropped. A message from a future term makes
    /// us step down to a leaderless follower in that term, which then steps the
    /// message itself. Messages in our own term are handled per message type
    /// below.
    ///
    /// Panics on messages that violate protocol invariants, e.g. a message
    /// from another leader in our term, or an AppendResponse that sets neither
    /// (or both) of match_index and reject_index.
    fn step(mut self, msg: Envelope) -> Result<Node> {
        // Past term: outdated peer, drop the message.
        if msg.term < self.term() {
            debug!("Dropping message from past term: {msg:?}");
            return Ok(self.into());
        }
        // Future term: become leaderless follower and step the message.
        if msg.term > self.term() {
            return self.into_follower(msg.term)?.step(msg);
        }

        // The message is from our own term.
        match msg.message {
            // A follower received our heartbeat and confirms our leadership.
            // We may be able to execute new reads, and we may find that the
            // follower's log is lagging and requires us to catch it up.
            Message::HeartbeatResponse { match_index, read_seq } => {
                let (last_index, _) = self.log.get_last_index();
                assert!(match_index <= last_index, "future match index");
                assert!(read_seq <= self.role.read_seq, "future read sequence number");

                // If the read sequence number advances, try to execute reads.
                if self.progress(msg.from).advance_read(read_seq) {
                    self.maybe_read()?;
                }

                // If the follower didn't match our last index, an append to it
                // must have failed (or it's catching up). Probe it to discover
                // a matching entry and start replicating. Move next_index back
                // to last_index since the follower just told us it doesn't have
                // it (or a previous last_index).
                if match_index == 0 {
                    self.progress(msg.from).regress_next(last_index);
                    self.maybe_send_append(msg.from, true)?;
                }

                // If the follower's match index advances, an append response
                // got lost. Try to commit and apply.
                //
                // We don't need to eagerly send any pending entries, since any
                // proposals made after this heartbeat was sent should have been
                // eagerly replicated in steady state. If not, the next
                // heartbeat will trigger a probe above.
                if self.progress(msg.from).advance(match_index) {
                    self.maybe_commit_and_apply()?;
                }
            }

            // A follower appended our log entries (or a probe found a match).
            // Record its progress and attempt to commit and apply.
            Message::AppendResponse { match_index, reject_index: 0 } if match_index > 0 => {
                let (last_index, _) = self.log.get_last_index();
                assert!(match_index <= last_index, "future match index");

                if self.progress(msg.from).advance(match_index) {
                    self.maybe_commit_and_apply()?;
                }

                // Eagerly send any further pending entries. This may be a
                // successful probe response, or the peer may be lagging and
                // we're catching it up one MAX_APPEND_ENTRIES batch at a time.
                self.maybe_send_append(msg.from, false)?;
            }

            // A follower confirmed our read sequence number. If it advances,
            // try to execute reads.
            Message::ReadResponse { seq } => {
                if self.progress(msg.from).advance_read(seq) {
                    self.maybe_read()?;
                }
            }

            // A follower rejected an append because the base entry in
            // reject_index did not match its log. Probe the previous entry by
            // sending an empty append until we find a common base.
            //
            // This linear probing can be slow with long divergent logs, but we
            // keep it simple. See also section 5.3 in the Raft paper.
            Message::AppendResponse { reject_index, match_index: 0 } if reject_index > 0 => {
                let (last_index, _) = self.log.get_last_index();
                assert!(reject_index <= last_index, "future reject index");

                // If the rejected base index is at or below the match index,
                // the rejection is stale and can be ignored.
                if reject_index <= self.progress(msg.from).match_index {
                    return Ok(self.into());
                }

                // Probe below the reject index, if we haven't already moved
                // next_index below it. This avoids sending duplicate probes
                // (heartbeats will trigger retries if they're lost).
                if self.progress(msg.from).regress_next(reject_index) {
                    self.maybe_send_append(msg.from, true)?;
                }
            }

            // AppendResponses must set either match_index or reject_index.
            // Anything not caught by the two arms above sets neither or both.
            Message::AppendResponse { .. } => panic!("invalid message {msg:?}"),

            // A client submitted a write request. Propose it, and wait until
            // it's replicated and applied to the state machine before returning
            // the response to the client.
            Message::ClientRequest { id, request: Request::Write(command) } => {
                let index = self.propose(Some(command))?;
                self.role.writes.insert(index, Write { from: msg.from, id });
                // With no peers there are no append responses to trigger the
                // commit, so commit and apply right away.
                if self.cluster_size() == 1 {
                    self.maybe_commit_and_apply()?;
                }
            }

            // A client submitted a read request. To ensure linearizability, we
            // must confirm that we are still the leader by sending the read's
            // sequence number and wait for quorum confirmation.
            Message::ClientRequest { id, request: Request::Read(command) } => {
                self.role.read_seq += 1;
                let read = Read { seq: self.role.read_seq, from: msg.from, id, command };
                self.role.reads.push_back(read);
                self.broadcast(Message::Read { seq: self.role.read_seq })?;
                // With no peers there are no read responses to trigger the
                // read, so attempt it right away.
                if self.cluster_size() == 1 {
                    self.maybe_read()?;
                }
            }

            // A client submitted a status command. Respond immediately.
            Message::ClientRequest { id, request: Request::Status } => {
                let response = self.status().map(Response::Status);
                self.send(msg.from, Message::ClientResponse { id, response })?;
            }

            // Don't grant any votes (we've already voted for ourself).
            Message::Campaign { .. } => {
                self.send(msg.from, Message::CampaignResponse { vote: false })?
            }

            // Votes can come in after we won the election, ignore them.
            Message::CampaignResponse { .. } => {}

            // There can't be another leader in this term.
            Message::Heartbeat { .. } | Message::Append { .. } | Message::Read { .. } => {
                panic!("saw other leader {} in term {}", msg.from, msg.term);
            }

            // Leaders don't proxy client requests.
            Message::ClientResponse { .. } => panic!("unexpected message {msg:?}"),
        }

        Ok(self.into())
    }

    /// Processes a logical clock tick, broadcasting a heartbeat to all peers
    /// once heartbeat_interval ticks have passed since the previous one.
    fn tick(mut self) -> Result<Node> {
        self.role.since_heartbeat += 1;
        let heartbeat_due = self.role.since_heartbeat >= self.opts.heartbeat_interval;
        if heartbeat_due {
            self.heartbeat()?;
        }
        Ok(self.into())
    }

    /// Broadcasts a heartbeat to all peers, carrying our last index, commit
    /// index, and latest read sequence number. Resets the heartbeat timer.
    ///
    /// Panics if our last log entry is not from the current term.
    fn heartbeat(&mut self) -> Result<()> {
        let (last_index, last_term) = self.log.get_last_index();
        assert_eq!(last_term, self.term(), "leader's last_term not in current term");

        let (commit_index, _) = self.log.get_commit_index();
        let heartbeat =
            Message::Heartbeat { last_index, commit_index, read_seq: self.role.read_seq };

        self.role.since_heartbeat = 0;
        self.broadcast(heartbeat)
    }

    /// Proposes a command for consensus by appending it to our log and
    /// replicating it to peers. If successful, it will eventually be committed
    /// and applied to the state machine.
    fn propose(&mut self, command: Option<Vec<u8>>) -> Result<Index> {
        let index = self.log.append(command)?;
        for peer in self.peers.iter().copied().sorted() {
            // Eagerly send the entry to the peer if it's in steady state and
            // we've sent all previous entries. Otherwise, the peer is lagging
            // and we're probing past entries for a match.
            if index == self.progress(peer).next_index {
                self.maybe_send_append(peer, false)?;
            }
        }
        Ok(index)
    }

    /// Commits any new entries that a quorum has replicated, applies them to
    /// the state machine, and sends write results back to waiting clients.
    /// Returns the resulting commit index, which is unchanged if nothing new
    /// could be committed.
    fn maybe_commit_and_apply(&mut self) -> Result<Index> {
        // The candidate commit index is the quorum value of all match indexes,
        // where our own log counts with its last index.
        let (last_index, _) = self.log.get_last_index();
        let match_indexes =
            self.role.progress.values().map(|p| p.match_index).chain([last_index]).collect();
        let commit_index = self.quorum_value(match_indexes);

        // Nothing to do unless the commit index moves forward. This is not
        // asserted, because the quorum value can regress, e.g. after a restart
        // or leader change where followers start out with match index 0.
        let (old_index, old_term) = self.log.get_commit_index();
        if commit_index <= old_index {
            return Ok(old_index);
        }

        // Only an entry from our own term can be committed safely (see section
        // 5.4.2 in the Raft paper).
        let Some(quorum_entry) = self.log.get(commit_index)? else {
            panic!("commit index {commit_index} missing");
        };
        if quorum_entry.term != self.term() {
            return Ok(old_index);
        }

        self.log.commit(commit_index)?;

        // Apply the newly committed entries. If an entry belongs to a pending
        // client write, return the result of applying it to the client.
        let term = self.term();
        let applied_index = self.state.get_applied_index();
        for entry in self.log.scan_apply(applied_index) {
            let entry = entry?;
            debug!("Applying {entry:?}");
            let pending = self.role.writes.remove(&entry.index);
            let result = self.state.apply(entry);

            let Some(Write { id, from }) = pending else {
                continue;
            };
            let response = result.map(Response::Write);
            let message = Message::ClientResponse { id, response };
            Self::send_via(&self.tx, Envelope { from: self.id, term, to: from, message })?;
        }

        // If the commit term changed, reads may have been waiting for us to
        // commit and apply an entry from our own term. Try to execute them.
        if old_term != self.term() {
            self.maybe_read()?;
        }

        Ok(commit_index)
    }

    /// Executes any ready read requests, where a quorum have confirmed that
    /// we're still the leader for the read sequences.
    fn maybe_read(&mut self) -> Result<()> {
        if self.role.reads.is_empty() {
            return Ok(());
        }

        // It's only safe to read if we've committed and applied an entry from
        // our own term (the leader appends an entry when elected). Otherwise we
        // may be behind on application and serve stale reads.
        let (commit_index, commit_term) = self.log.get_commit_index();
        let applied_index = self.state.get_applied_index();
        if commit_term < self.term() || applied_index < commit_index {
            return Ok(());
        }

        // Determine the maximum read sequence confirmed by quorum.
        let quorum_read_seq = self.quorum_value(
            self.role.progress.values().map(|p| p.read_seq).chain([self.role.read_seq]).collect(),
        );

        // Execute ready reads. The VecDeque is ordered by read_seq, so we
        // can keep pulling until we hit quorum_read_seq.
        while let Some(read) = self.role.reads.front() {
            if read.seq > quorum_read_seq {
                break;
            }
            let read = self.role.reads.pop_front().unwrap();
            let response = self.state.read(read.command).map(Response::Read);
            self.send(read.from, Message::ClientResponse { id: read.id, response })?;
        }
        Ok(())
    }

    /// Sends the follower a batch of pending log entries from the range
    /// [next_index,last_index], at most max_append_entries of them.
    ///
    /// With probe set, the goal is instead to find an index where the
    /// follower's log matches ours. An empty append is sent with a base_index
    /// of next_index-1. When the follower confirms that the base matches, the
    /// entries themselves are sent next. When it rejects the base, next_index
    /// is moved back and another probe is sent, until a match is found. See
    /// section 5.3 in the Raft paper.
    ///
    /// Nothing is sent if the follower is already caught up (according to
    /// match_index and last_index). A requested probe is turned into a regular
    /// append if its base_index is already confirmed via match_index.
    fn maybe_send_append(&mut self, peer: NodeID, mut probe: bool) -> Result<()> {
        let (last_index, _) = self.log.get_last_index();
        let progress = self.role.progress.get_mut(&peer).expect("unknown node");
        let (match_index, next_index) = (progress.match_index, progress.next_index);
        assert_ne!(next_index, 0, "invalid next_index");
        assert!(next_index > match_index, "invalid next_index <= match_index");
        assert!(match_index <= last_index, "invalid match_index > last_index");
        assert!(next_index <= last_index + 1, "invalid next_index > last_index + 1");

        // A peer that has our entire log doesn't need an append.
        if match_index == last_index {
            return Ok(());
        }

        // Probing is pointless when the base entry is already known to match.
        // Send the actual entries in that case.
        if probe && next_index <= match_index + 1 {
            probe = false;
        }

        // Without a probe, and with everything already sent, we have to wait
        // for the follower to respond before sending anything more.
        if !probe && next_index > last_index {
            return Ok(());
        }

        // The base is the entry immediately before next_index.
        let (base_index, base_term) = if next_index == 0 {
            panic!("next_index=0 for node {peer}")
        } else if next_index == 1 {
            (0, 0) // first entry, there is no base
        } else {
            let base = self.log.get(next_index - 1)?.expect("missing base entry");
            (base.index, base.term)
        };

        // Probes carry no entries, appends carry the next batch.
        let entries = if probe {
            Vec::new()
        } else {
            self.log.scan(next_index..).take(self.opts.max_append_entries).try_collect()?
        };

        // Assume the follower will accept the entries, and move next_index
        // past them so they aren't sent again before we get a response.
        if let Some(last) = entries.last() {
            progress.next_index = last.index + 1;
        }

        debug!("Replicating {} entries with base {base_index} to {peer}", entries.len());
        self.send(peer, Message::Append { base_index, base_term, entries })
    }

    /// Generates cluster status as seen by this leader: the term, every node's
    /// match index (our own is our last index), the commit and applied
    /// indexes, and the log storage status.
    fn status(&mut self) -> Result<Status> {
        let (last_index, _) = self.log.get_last_index();
        let (commit_index, _) = self.log.get_commit_index();
        let match_index = self
            .role
            .progress
            .iter()
            .map(|(&id, p)| (id, p.match_index))
            .chain([(self.id, last_index)])
            .collect();
        Ok(Status {
            leader: self.id,
            term: self.term(),
            match_index,
            commit_index,
            applied_index: self.state.get_applied_index(),
            storage: self.log.status()?,
        })
    }

    /// Convenience accessor for a peer's replication progress, as a mutable
    /// borrow. Panics if the node is not a known peer.
    fn progress(&mut self, id: NodeID) -> &mut Progress {
        let Some(progress) = self.role.progress.get_mut(&id) else {
            panic!("unknown node");
        };
        progress
    }
}

/// Most Raft tests are Goldenscripts under src/raft/testscripts.
#[cfg(test)]
mod tests {
    use std::borrow::Borrow;
    use std::error::Error;
    use std::fmt::Write as _;
    use std::path::Path;
    use std::result::Result;

    use crossbeam::channel::Receiver;
    use tempfile::TempDir;
    use test_case::test_case;
    use test_each_file::test_each_path;
    use uuid::Uuid;

    use super::*;
    use crate::encoding::{Key as _, Value as _, bincode};
    use crate::raft::Entry;
    use crate::raft::state::test::{self as teststate, KVCommand, KVResponse};
    use crate::storage;
    use crate::storage::engine::test as testengine;

    // Run the goldenscript tests in src/raft/testscripts/node, via
    // test_goldenscript below.
    test_each_path! { in "src/raft/testscripts/node" as scripts => test_goldenscript }

    /// Runs the goldenscript at the given path using a new TestRunner,
    /// panicking if it fails.
    fn test_goldenscript(path: &Path) {
        let mut runner = TestRunner::new();
        goldenscript::run(&mut runner, path).expect("goldenscript failed")
    }

    /// Tests RawNode.quorum_size() and cluster_size() for clusters of the given
    /// size, where the node under test has ID 1 and the peers have IDs 2..=size.
    #[test_case(1 => 1)]
    #[test_case(2 => 2)]
    #[test_case(3 => 2)]
    #[test_case(4 => 3)]
    #[test_case(5 => 3)]
    #[test_case(6 => 4)]
    #[test_case(7 => 4)]
    #[test_case(8 => 5)]
    fn quorum_size(size: usize) -> usize {
        let peers = (2..=size as NodeID).collect();
        let node = RawNode::new_noop(1, peers);
        assert_eq!(node.cluster_size(), size);
        node.quorum_size()
    }

    /// Tests RawNode.quorum_value(), using a cluster with as many nodes as
    /// there are values.
    #[test_case(vec![1] => 1)]
    #[test_case(vec![1,3,2] => 2)]
    #[test_case(vec![4,1,3,2] => 2)]
    #[test_case(vec![1,1,1,2,2] => 1)]
    #[test_case(vec![1,1,2,2,2] => 2)]
    fn quorum_value(values: Vec<i8>) -> i8 {
        let nodes = values.len();
        let peers = (2..=nodes as NodeID).collect();
        let node = RawNode::new_noop(1, peers);
        assert_eq!(node.cluster_size(), nodes);
        node.quorum_value(values)
    }

    /// Test helpers for RawNode.
    impl RawNode<Follower> {
        /// Creates a noop node: an in-memory log, a noop state machine, and a
        /// transport whose receiving end is dropped immediately.
        fn new_noop(id: NodeID, peers: HashSet<NodeID>) -> Self {
            let (tx, _) = crossbeam::channel::unbounded();
            let engine = Box::new(storage::Memory::new());
            let log = Log::new(engine).expect("log failed");
            let opts = Options::default();
            RawNode::new(id, peers, log, teststate::Noop::new(), tx, opts).expect("node failed")
        }
    }

    /// Helper macro which calls a closure on the inner RawNode<Role>,
    /// regardless of which role the Node currently has. The closure's result
    /// is returned.
    ///
    /// There are three forms, depending on how the closure receives the node:
    ///
    /// * `with_rawnode!(node, closure)`: by value, consuming the Node.
    /// * `with_rawnode!(ref node, closure)`: by shared reference.
    /// * `with_rawnode!(ref mut node, closure)`: by mutable reference.
    ///
    /// The same closure expression is expanded once per role, so it must be
    /// valid for every RawNode<R>. Each form declares a local generic function
    /// `with()` which is called with the node and closure; presumably this is
    /// what gives the closure parameter a concrete type in each match arm.
    macro_rules! with_rawnode {
        // Node is moved.
        ($node:expr, $closure:expr) => {{
            fn with<R: Role, T>(node: RawNode<R>, f: impl FnOnce(RawNode<R>) -> T) -> T {
                f(node)
            }
            match $node {
                Node::Candidate(node) => with(node, $closure),
                Node::Follower(node) => with(node, $closure),
                Node::Leader(node) => with(node, $closure),
            }
        }};
        // Node is borrowed (ref).
        (ref $node:expr, $closure:expr) => {{
            fn with<R: Role, T>(node: &RawNode<R>, f: impl FnOnce(&RawNode<R>) -> T) -> T {
                f(node)
            }
            match $node {
                &Node::Candidate(ref node) => with(node, $closure),
                &Node::Follower(ref node) => with(node, $closure),
                &Node::Leader(ref node) => with(node, $closure),
            }
        }};
        // Node is mutably borrowed (ref mut).
        (ref mut $node:expr, $closure:expr) => {{
            fn with<R: Role, T>(node: &mut RawNode<R>, f: impl FnOnce(&mut RawNode<R>) -> T) -> T {
                f(node)
            }
            match $node {
                &mut Node::Candidate(ref mut node) => with(node, $closure),
                &mut Node::Follower(ref mut node) => with(node, $closure),
                &mut Node::Leader(ref mut node) => with(node, $closure),
            }
        }};
    }

    /// Test helpers for Node. These reach into the inner RawNode regardless of
    /// the node's current role.
    impl Node {
        /// Consumes the node, returning its log and state machine.
        fn dismantle(self) -> (Log, Box<dyn State>) {
            with_rawnode!(self, |raw| (raw.log, raw.state))
        }

        /// Returns the state machine's applied index.
        fn get_applied_index(&self) -> Index {
            with_rawnode!(ref self, |raw| raw.state.get_applied_index())
        }

        /// Returns the log's commit index and term.
        fn get_commit_index(&self) -> (Index, Term) {
            with_rawnode!(ref self, |raw| raw.log.get_commit_index())
        }

        /// Returns the log's last index and term.
        fn get_last_index(&self) -> (Index, Term) {
            with_rawnode!(ref self, |raw| raw.log.get_last_index())
        }

        /// Returns the node's current term and vote, as stored in the log.
        fn get_term_vote(&self) -> (Term, Option<NodeID>) {
            with_rawnode!(ref self, |raw| raw.log.get_term_vote())
        }

        /// Returns a copy of the node's options.
        fn options(&self) -> Options {
            with_rawnode!(ref self, |raw| raw.opts.clone())
        }

        /// Returns a copy of the node's peer set.
        fn peers(&self) -> HashSet<NodeID> {
            with_rawnode!(ref self, |raw| raw.peers.clone())
        }

        /// Executes a read command directly against the node's state machine.
        fn read(&self, command: Vec<u8>) -> crate::error::Result<Vec<u8>> {
            with_rawnode!(ref self, |raw| raw.state.read(command))
        }

        /// Returns all entries in the node's log.
        fn scan_log(&mut self) -> crate::error::Result<Vec<Entry>> {
            with_rawnode!(ref mut self, |raw| raw.log.scan(..).collect())
        }
    }

    /// Runs Raft goldenscript tests. See run() for available commands.
    ///
    /// The runner holds an entire test cluster, along with the message queues
    /// between its nodes and any in-flight client requests.
    struct TestRunner {
        /// IDs of all cluster nodes, in order.
        ids: Vec<NodeID>,
        /// The cluster nodes, keyed by node ID.
        nodes: HashMap<NodeID, Node>,
        /// Outbound send queues from each node, keyed by node ID.
        nodes_rx: HashMap<NodeID, Receiver<Envelope>>,
        /// Inbound receive queues to each node, to be stepped. Keyed by node ID.
        nodes_pending: HashMap<NodeID, Vec<Envelope>>,
        /// Applied log entries for each node, after state machine application.
        /// Keyed by node ID.
        applied_rx: HashMap<NodeID, Receiver<Entry>>,
        /// Network partitions (sender → receivers). A symmetric (bidirectional)
        /// partition needs an entry from each side.
        disconnected: HashMap<NodeID, HashSet<NodeID>>,
        /// In-flight client requests, keyed by request ID.
        requests: HashMap<RequestID, Request>,
        /// The request ID to use for the next client request.
        next_request_id: u64,
        /// Temporary directory (deleted when dropped).
        tempdir: TempDir,
    }

    impl goldenscript::Runner for TestRunner {
        /// Runs a goldenscript command.
        fn run(&mut self, command: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
            let mut output = String::new();
            match command.name.as_str() {
                // campaign [ID...]
                // Transition the given nodes to candidates and campaign.
                "campaign" => {
                    let ids = self.parse_ids_or_all(&command.args)?;
                    self.campaign(&ids, &mut output)?;
                }

                // cluster nodes=N [leader=ID] [heartbeat_interval=N] [election_timeout=N] [max_append_entries=N]
                // Creates a new Raft cluster.
                "cluster" => {
                    let mut opts = Options::default();
                    let mut args = command.consume_args();
                    let nodes = args.lookup_parse("nodes")?.unwrap_or(0);
                    let leader = args.lookup_parse("leader")?;
                    if let Some(heartbeat_interval) = args.lookup_parse("heartbeat_interval")? {
                        opts.heartbeat_interval = heartbeat_interval;
                    };
                    if let Some(election_timeout) = args.lookup_parse("election_timeout")? {
                        opts.election_timeout_range = election_timeout..election_timeout + 1;
                    }
                    if let Some(max_append_entries) = args.lookup_parse("max_append_entries")? {
                        opts.max_append_entries = max_append_entries;
                    }
                    args.reject_rest()?;
                    self.cluster(nodes, leader, opts, &mut output)?;
                }

                // deliver [from=ID] [ID...]
                // Delivers (steps) pending messages to the given nodes. If from
                // is given, only messages from the given node is delivered, the
                // others are left pending.
                "deliver" => {
                    let mut args = command.consume_args();
                    let from = args.lookup_parse("from")?;
                    let ids = self.parse_ids_or_all(&args.rest())?;
                    self.deliver(&ids, from, &mut output)?;
                }

                // get ID KEY
                // Sends a client request to the given node to read the given
                // key from the state machine (key/value store).
                "get" => {
                    let mut args = command.consume_args();
                    let id = args.next_pos().ok_or("must specify node ID")?.parse()?;
                    let key = args.next_pos().ok_or("must specify key")?.value.clone();
                    args.reject_rest()?;
                    let request = Request::Read(KVCommand::Get { key }.encode());
                    self.request(id, request, &mut output)?;
                }

                // heal [ID...]
                // Heals all network partitions for the given nodes.
                "heal" => {
                    let ids = self.parse_ids_or_all(&command.args)?;
                    self.heal(&ids, &mut output)?;
                }

                // heartbeat ID...
                // Sends a heartbeat from the given leader nodes.
                "heartbeat" => {
                    let ids = self.parse_ids_or_all(&command.args)?;
                    self.heartbeat(&ids, &mut output)?;
                }

                // log [ID...]
                // Outputs the current Raft log for the given nodes.
                "log" => {
                    let ids = self.parse_ids_or_all(&command.args)?;
                    self.log(&ids, &mut output)?;
                }

                // partition ID...
                // Partitions the given nodes away from the rest of the cluster.
                // They can still communicate with each other, unless they were
                // previously partitioned.
                "partition" => {
                    let ids = self.parse_ids_or_error(&command.args)?;
                    self.partition(&ids, &mut output)?;
                }

                // put ID KEY=VALUE
                // Sends a client request to the given node to write a key/value
                // pair to the state machine (key/value store).
                "put" => {
                    let mut args = command.consume_args();
                    let id = args.next_pos().ok_or("must specify node ID")?.parse()?;
                    let kv = args.next_key().ok_or("must specify key/value pair")?.clone();
                    let (key, value) = (kv.key.unwrap(), kv.value);
                    args.reject_rest()?;
                    let request = Request::Write(KVCommand::Put { key, value }.encode());
                    self.request(id, request, &mut output)?;
                }

                // restart [commit_index=INDEX] [applied_index=INDEX] [ID...]
                // Restarts the given nodes (or all nodes). They retain their
                // log and state, unless applied_index is given (which reverts
                // the state machine to the given index, or 0 if empty).
                // commit_index may be given to regress the commit index (it
                // is not flushed to durable storage).
                "restart" => {
                    let mut args = command.consume_args();
                    let applied_index = args.lookup_parse("applied_index")?;
                    let commit_index = args.lookup_parse("commit_index")?;
                    let ids = self.parse_ids_or_all(&args.rest())?;
                    self.restart(&ids, commit_index, applied_index, &mut output)?;
                }

                // stabilize [heartbeat=BOOL] [ID...]
                // Stabilizes the given nodes by repeatedly delivering messages
                // until no more messages are pending. If heartbeat is true, also
                // emits a heartbeat from the leader and restabilizes, e.g. to
                // propagate the commit index.
                "stabilize" => {
                    let mut args = command.consume_args();
                    let heartbeat = args.lookup_parse("heartbeat")?.unwrap_or(false);
                    let ids = self.parse_ids_or_all(&args.rest())?;
                    self.stabilize(&ids, heartbeat, &mut output)?;
                }

                // state [ID...]
                // Prints the current state machine contents on the given nodes.
                "state" => {
                    let ids = self.parse_ids_or_all(&command.args)?;
                    self.state(&ids, &mut output)?;
                }

                // status [request=BOOL] [ID...]
                // Prints the current node status of the given nodes. If request
                // is true, sends a status client request to a single node,
                // otherwise fetches status directly from each node.
                "status" => {
                    let mut args = command.consume_args();
                    let request = args.lookup_parse("request")?.unwrap_or(false);
                    let ids = self.parse_ids_or_all(&args.rest())?;
                    if request {
                        let [id] = *ids.as_slice() else {
                            return Err("request=true requires 1 node ID".into());
                        };
                        self.request(id, Request::Status, &mut output)?;
                    } else {
                        self.status(&ids, &mut output)?;
                    }
                }

                // step ID JSON
                // Steps a manually generated JSON message on the given node.
                "step" => {
                    let mut args = command.consume_args();
                    let id = args.next_pos().ok_or("node ID not given")?.parse()?;
                    let raw = &args.next_pos().ok_or("message not given")?.value;
                    let msg = serde_json::from_str(raw)?;
                    args.reject_rest()?;
                    self.transition(id, |n| n.step(msg), &mut output)?;
                }

                // tick [ID...]
                // Ticks the given nodes.
                "tick" => {
                    let ids = self.parse_ids_or_all(&command.args)?;
                    for id in ids {
                        self.transition(id, |n| n.tick(), &mut output)?;
                    }
                }

                name => return Err(format!("unknown command {name}").into()),
            }
            Ok(output)
        }
    }

    impl TestRunner {
        /// Creates a test runner without any nodes. Request IDs start at 1,
        /// and on-disk node data goes into a new temporary directory with
        /// the prefix "toydb".
        ///
        /// Panics if the temporary directory can't be created.
        fn new() -> Self {
            let tempdir = TempDir::with_prefix("toydb").expect("tempdir failed");
            Self {
                tempdir,
                next_request_id: 1,
                requests: HashMap::new(),
                ids: Vec::new(),
                nodes: HashMap::new(),
                nodes_rx: HashMap::new(),
                nodes_pending: HashMap::new(),
                applied_rx: HashMap::new(),
                disconnected: HashMap::new(),
            }
        }

        /// Builds a new empty node (fresh log and key/value state machine)
        /// and registers it with the runner.
        ///
        /// Panics if the BitCask engine can't be opened.
        fn add_node(
            &mut self,
            id: NodeID,
            peers: HashSet<NodeID>,
            opts: Options,
        ) -> Result<(), Box<dyn Error>> {
            // The log is backed by a mirror engine that replays every
            // operation against both a BitCask engine (stored as "{id}.log"
            // in the runner's temporary directory) and a Memory engine, for
            // added engine test coverage.
            let file = self.tempdir.path().join(format!("{id}.log"));
            let engine = testengine::Mirror::new(
                storage::BitCask::new(file).expect("bitcask failed"),
                storage::Memory::new(),
            );
            let log = Log::new(Box::new(engine))?;
            self.add_node_with(id, peers, log, teststate::KV::new(), opts)
        }

        /// Wraps the given log and state machine in a new node, and registers
        /// the node along with its runner bookkeeping: the outbound message
        /// receiver, an empty pending message queue, the applied entry
        /// receiver, and an empty set of disconnected peers.
        ///
        /// Nothing is registered if the node can't be constructed.
        fn add_node_with(
            &mut self,
            id: NodeID,
            peers: HashSet<NodeID>,
            log: Log,
            state: Box<dyn State>,
            opts: Options,
        ) -> Result<(), Box<dyn Error>> {
            // The runner observes the node through two unbounded channels:
            // one for outbound messages, and one fed by the Emit wrapper
            // around the state machine.
            let (node_tx, node_rx) = crossbeam::channel::unbounded();
            let (applied_tx, applied_rx) = crossbeam::channel::unbounded();
            let emitter = teststate::Emit::new(state, applied_tx);
            let node = Node::new(id, peers, log, emitter, node_tx, opts)?;

            self.nodes.insert(id, node);
            self.nodes_rx.insert(id, node_rx);
            self.applied_rx.insert(id, applied_rx);
            self.nodes_pending.insert(id, Vec::new());
            self.disconnected.insert(id, HashSet::new());
            Ok(())
        }

        /// Transitions nodes to candidates and campaign in a new term.
        fn campaign(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box<dyn Error>> {
            let campaign = |node| match node {
                Node::Candidate(mut node) => {
                    node.campaign()?;
                    Ok(node.into())
                }
                Node::Follower(node) => Ok(node.into_candidate()?.into()),
                Node::Leader(node) => {
                    let term = node.term();
                    Ok(node.into_follower(term + 1)?.into_candidate()?.into())
                }
            };
            for id in ids.iter().copied() {
                self.transition(id, campaign, output)?;
            }
            Ok(())
        }

        /// Creates a Raft cluster with node IDs 1 through `nodes`, where each
        /// node has all other nodes as peers. If `leader` is given, that node
        /// is promoted to leader and the cluster is stabilized, without
        /// producing any output for this. Finally outputs the status of all
        /// nodes.
        ///
        /// Errors if a cluster already exists, if `nodes` is 0, or if the
        /// requested leader isn't a follower node in the new cluster.
        fn cluster(
            &mut self,
            nodes: u8,
            leader: Option<NodeID>,
            opts: Options,
            output: &mut String,
        ) -> Result<(), Box<dyn Error>> {
            if !self.ids.is_empty() {
                return Err("cluster already exists".into());
            }
            if nodes == 0 {
                return Err("cluster can't have 0 nodes".into());
            }

            self.ids = (1..=nodes).collect();

            for id in self.ids.clone() {
                let peers = self.ids.iter().copied().filter(|peer| *peer != id).collect();
                self.add_node(id, peers, opts.clone())?;
            }

            // Promote the requested leader, if any. All output from this is
            // written to a scratch buffer and discarded.
            if let Some(id) = leader {
                let mut quiet = String::new();
                match self.nodes.remove(&id) {
                    Some(Node::Follower(follower)) => {
                        let promoted = follower.into_candidate()?.into_leader()?;
                        self.nodes.insert(id, promoted.into());
                    }
                    _ => return Err(format!("invalid leader {id}").into()),
                }
                self.receive(id, &mut quiet)?;
                let all = self.ids.clone();
                self.stabilize(&all, true, &mut quiet)?;
            }

            // Discard entries applied during setup, so they don't show up as
            // output of the next transition.
            for rx in self.applied_rx.values() {
                rx.try_iter().for_each(drop);
            }

            // Output the final cluster status.
            self.status(&self.ids, output)
        }

        /// Steps the given nodes with their pending inbound messages, and
        /// returns the number of messages delivered. If `from` is given, only
        /// messages sent by that node are delivered, and the rest remain
        /// queued.
        ///
        /// Errors if any of the nodes are unknown.
        fn deliver(
            &mut self,
            ids: &[NodeID],
            from: Option<NodeID>,
            output: &mut String,
        ) -> Result<usize, Box<dyn Error>> {
            // Collect all messages to deliver up front, before stepping any
            // of them. Otherwise, messages sent in response to a delivery
            // could be delivered to a higher node ID in the same pass, which
            // gives unintuitive results.
            let mut inbox = Vec::new();
            for &id in ids {
                let queued =
                    self.nodes_pending.remove(&id).ok_or_else(|| format!("unknown node {id}"))?;
                let (matching, remaining): (Vec<_>, Vec<_>) =
                    queued.into_iter().partition(|m| from.is_none_or(|sender| sender == m.from));
                self.nodes_pending.insert(id, remaining);
                inbox.extend(matching);
            }

            let count = inbox.len();
            for msg in inbox {
                self.transition(msg.to, |node| node.step(msg), output)?;
            }
            Ok(count)
        }

        /// Restores connectivity between the given nodes and all other nodes,
        /// in both directions, and outputs the resulting partition layout.
        fn heal(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box<dyn Error>> {
            for id in ids {
                // Clear the node's own disconnects, then remove the node from
                // everyone else's.
                self.disconnected.insert(*id, HashSet::new());
                self.disconnected.values_mut().for_each(|blocked| {
                    blocked.remove(id);
                });
            }
            let layout = Self::format_disconnected(&self.disconnected);
            output.push_str(&layout);
            Ok(())
        }

        /// Makes each of the given nodes emit a heartbeat, and receives the
        /// resulting outbound messages. Errors if a node is unknown or is not
        /// currently a leader.
        fn heartbeat(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box<dyn Error>> {
            for &id in ids {
                match self.nodes.get_mut(&id) {
                    Some(Node::Leader(leader)) => {
                        leader.heartbeat()?;
                    }
                    _ => return Err(format!("{id} is not a leader").into()),
                }
                self.receive(id, output)?;
            }
            Ok(())
        }

        /// Outputs a log summary line (term, last index, commit index, and
        /// vote) for each of the given nodes, followed by one line per log
        /// entry. Errors if a node is unknown.
        fn log(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box<dyn Error>> {
            for id in ids {
                let node = self.nodes.get_mut(id).ok_or_else(|| format!("unknown node {id}"))?;
                let prefix = Self::format_node(node);
                let (last_index, last_term) = node.get_last_index();
                let (commit_index, commit_term) = node.get_commit_index();
                let (term, vote) = node.get_term_vote();
                writeln!(
                    output,
                    "{prefix} term={term} last={last_index}@{last_term} commit={commit_index}@{commit_term} vote={vote:?}",
                )?;
                for entry in node.scan_log()? {
                    let entry = Self::format_entry(&entry);
                    writeln!(output, "{prefix} entry {entry}")?;
                }
            }
            Ok(())
        }

        /// Disconnects the given nodes from every node that is not in the
        /// given set, in both directions, and outputs the resulting partition
        /// layout. Connectivity between the given nodes themselves is left
        /// unchanged, so they can still communicate unless they were
        /// partitioned from each other earlier.
        fn partition(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box<dyn Error>> {
            let isolated: HashSet<NodeID> = ids.iter().copied().collect();
            let outsiders: Vec<NodeID> =
                self.ids.iter().copied().filter(|id| !isolated.contains(id)).collect();
            for &id in &isolated {
                for &peer in &outsiders {
                    self.disconnected.entry(id).or_default().insert(peer);
                    self.disconnected.entry(peer).or_default().insert(id);
                }
            }
            let layout = Self::format_disconnected(&self.disconnected);
            output.push_str(&layout);
            Ok(())
        }

        /// Drains the outbound messages of the given node, outputs them, and
        /// routes each one. Returns the number of messages drained, including
        /// dropped messages and client responses.
        ///
        /// * Messages to a peer that the sender is disconnected from are
        ///   dropped, and output with strikethrough formatting.
        /// * Self-addressed messages must be client responses. These are
        ///   output together with the original request, which is then
        ///   forgotten.
        /// * All other messages are queued for delivery to the recipient.
        fn receive(&mut self, id: NodeID, output: &mut String) -> Result<u32, Box<dyn Error>> {
            let rx = self.nodes_rx.get_mut(&id).ok_or_else(|| format!("unknown node {id}"))?;
            let mut received = 0;
            for msg in rx.try_iter() {
                received += 1;
                let (from, to, term) = (msg.from, msg.to, msg.term);
                let text = Self::format_message(&msg.message);

                if self.disconnected[&from].contains(&to) {
                    // The recipient is unreachable: drop the message.
                    let struck = Self::format_strikethrough(&text);
                    writeln!(output, "n{from}@{term} ⇥ n{to} {struck}")?;
                } else if from == to {
                    // Nodes only address themselves when responding to
                    // clients. Intercept the response and output it.
                    let Message::ClientResponse { id, response } = &msg.message else {
                        return Err(format!("invalid self-addressed message: {msg:?}").into());
                    };
                    writeln!(output, "n{from}@{term} → c{to} {text}")?;
                    let request = self.requests.remove(id).ok_or("unknown request id")?;
                    let request = Self::format_request(&request);
                    let response = Self::format_response(response);
                    writeln!(output, "c{to}@{term} {request} ⇒ {response}")?;
                } else {
                    // A regular peer message: queue it for later delivery.
                    writeln!(output, "n{from}@{term} → n{to} {text}")?;
                    self.nodes_pending
                        .get_mut(&to)
                        .ok_or_else(|| format!("unknown node {to}"))?
                        .push(msg);
                }
            }
            Ok(received)
        }

        /// Submits a client request via the given node, by stepping the node
        /// with a self-addressed `ClientRequest` message in the node's current
        /// term. The request is tracked under a sequentially allocated
        /// request ID, such that `receive` can pair it with the response.
        ///
        /// Errors if the node is unknown, in which case no request ID is
        /// allocated and no request is tracked.
        fn request(
            &mut self,
            id: NodeID,
            request: Request,
            output: &mut String,
        ) -> Result<(), Box<dyn Error>> {
            // Look up the node before touching any request state. Callers
            // don't necessarily validate the node ID, and a failed lookup
            // must not consume a request ID or leave a stale tracked request
            // behind.
            let term = self.nodes.get(&id).ok_or_else(|| format!("unknown node {id}"))?.term();

            let request_id = Uuid::from_u64_pair(0, self.next_request_id);
            self.next_request_id += 1;
            self.requests.insert(request_id, request.clone());

            let msg = Envelope {
                from: id,
                to: id,
                term,
                message: Message::ClientRequest { id: request_id, request },
            };
            writeln!(output, "c{id}@{term} → n{id} {}", Self::format_message(&msg.message))?;
            self.transition(id, |n| n.step(msg), output)
        }

        /// Restarts the given nodes. If commit_index or applied_index are
        /// given, the log commit index or state machine will regress.
        ///
        /// Each node is removed, dismantled into its log and state machine,
        /// and rebuilt with the same peers and options via `add_node_with`.
        /// The log is reopened on top of the same storage engine. Finally
        /// outputs the status of the restarted nodes.
        ///
        /// Errors if a node is unknown, if commit_index is beyond the current
        /// commit index or has no log entry, or if applied_index is beyond
        /// the commit index.
        ///
        /// NOTE(review): the node is removed from `self.nodes` up front, and
        /// the error returns below don't reinsert it, so a failed restart
        /// leaves the node missing from the runner.
        fn restart(
            &mut self,
            ids: &[NodeID],
            commit_index: Option<Index>,
            applied_index: Option<Index>,
            output: &mut String,
        ) -> Result<(), Box<dyn Error>> {
            for id in ids.iter().copied() {
                let node = self.nodes.remove(&id).ok_or(format!("unknown node {id}"))?;
                // Capture the configuration before the node is consumed.
                let peers = node.peers();
                let opts = node.options();
                let (log, mut state) = node.dismantle();
                let mut log = Log::new(log.engine)?; // reset log

                // If requested, regress the commit index.
                if let Some(commit_index) = commit_index {
                    if commit_index > log.get_commit_index().0 {
                        return Err(format!("commit_index={commit_index} beyond current").into());
                    }
                    // The commit index is stored along with the term of the
                    // entry at that index. Index 0 has no entry, and uses
                    // term 0.
                    let commit_term = match log.get(commit_index)? {
                        Some(e) => e.term,
                        None if commit_index == 0 => 0,
                        None => return Err(format!("unknown commit_index={commit_index}").into()),
                    };
                    // Write the commit index key directly to the storage
                    // engine, bypassing the log.
                    log.engine.set(
                        &crate::raft::log::Key::CommitIndex.encode(),
                        bincode::serialize(&(commit_index, commit_term)),
                    )?;
                    // Reset the log again.
                    log = Log::new(log.engine)?;
                }

                // If requested, wipe the state machine and reapply up to the
                // requested applied index.
                if let Some(applied_index) = applied_index {
                    if applied_index > log.get_commit_index().0 {
                        return Err(format!("applied_index={applied_index} beyond commit").into());
                    }
                    // Start over from an empty key/value state machine.
                    state = teststate::KV::new();
                    let mut scan = log.scan(..=applied_index);
                    while let Some(entry) = scan.next().transpose()? {
                        _ = state.apply(entry); // apply errors are returned to client
                    }
                    assert_eq!(state.get_applied_index(), applied_index, "wrong applied index");
                }

                // Add node, and run a noop transition to output applied entries.
                self.add_node_with(id, peers, log, state, opts)?;
                self.transition(id, Ok, output)?;
            }
            // Output restarted node status.
            self.status(ids, output)
        }

        /// Stabilizes the given nodes by repeatedly delivering their pending
        /// messages until a delivery pass delivers nothing. If heartbeat is
        /// true, the leader with the highest term (among all nodes, not just
        /// the given ones) then emits a heartbeat and the given nodes are
        /// stabilized once more, e.g. to propagate the commit index. Without
        /// any leader, the heartbeat is skipped.
        fn stabilize(
            &mut self,
            ids: &[NodeID],
            heartbeat: bool,
            output: &mut String,
        ) -> Result<(), Box<dyn Error>> {
            while self.deliver(ids, None, output)? > 0 {}
            if !heartbeat {
                return Ok(());
            }

            // Find the current leader, i.e. the leader with the highest term.
            let leader_id = self
                .nodes
                .values()
                .filter(|node| matches!(node, Node::Leader(_)))
                .max_by_key(|node| node.term())
                .map(|node| node.id());

            if let Some(leader_id) = leader_id {
                self.heartbeat(&[leader_id], output)?;
                while self.deliver(ids, None, output)? > 0 {}
            }
            Ok(())
        }

        /// Outputs the applied index of each of the given nodes, followed by
        /// one line per key/value pair in its state machine. The contents are
        /// fetched by reading a scan command directly from the node. Errors
        /// if a node is unknown.
        fn state(&mut self, ids: &[NodeID], output: &mut String) -> Result<(), Box<dyn Error>> {
            for id in ids {
                let node = self.nodes.get_mut(id).ok_or_else(|| format!("unknown node {id}"))?;
                let prefix = Self::format_node(node);
                let applied_index = node.get_applied_index();
                let raw = node.read(KVCommand::Scan.encode())?;
                let kvs = match KVResponse::decode(&raw)? {
                    KVResponse::Scan(kvs) => kvs,
                    _ => return Err("unexpected scan response".into()),
                };
                writeln!(output, "{prefix} applied={applied_index}")?;
                for (key, value) in kvs {
                    writeln!(output, "{prefix} state {key}={value}")?;
                }
            }
            Ok(())
        }

        /// Outputs one status line per given node, with its role, last log
        /// index, commit index, and applied index. Leaders also include the
        /// replication progress (match index → next index) of each peer,
        /// ordered by peer ID. Errors if a node is unknown.
        fn status(&self, ids: &[NodeID], output: &mut String) -> Result<(), Box<dyn Error>> {
            for id in ids {
                let node = self.nodes.get(id).ok_or_else(|| format!("unknown node {id}"))?;
                let role = Self::format_node_role(node);
                let (last_index, last_term) = node.get_last_index();
                let (commit_index, commit_term) = node.get_commit_index();
                let applied_index = node.get_applied_index();
                write!(
                    output,
                    "{role} last={last_index}@{last_term} commit={commit_index}@{commit_term} applied={applied_index}",
                )?;
                if let Node::Leader(leader) = node {
                    let progress = leader
                        .role
                        .progress
                        .iter()
                        .sorted_by_key(|(peer, _)| *peer)
                        .map(|(peer, pr)| format!("{peer}:{}→{}", pr.match_index, pr.next_index))
                        .join(" ");
                    write!(output, " progress={{{progress}}}")?;
                }
                writeln!(output)?;
            }
            Ok(())
        }

        /// Applies a node transition (typically a step or tick), and outputs
        /// relevant changes.
        ///
        /// The node is taken out of the runner, passed through `f`, and put
        /// back. The node is inspected before and after the transition, and
        /// the following is output, in order: a role change, appended log
        /// entries, a commit index change, and entries applied to the state
        /// machine. Finally, the node's outbound messages are received.
        ///
        /// Errors if the node is unknown, or if the log scans or the
        /// transition itself fail.
        ///
        /// NOTE(review): `f` consumes the node, so when `f` (or a log scan)
        /// fails the node is not reinserted into `self.nodes`.
        fn transition(
            &mut self,
            id: NodeID,
            f: impl FnOnce(Node) -> crate::error::Result<Node>,
            output: &mut String,
        ) -> Result<(), Box<dyn Error>> {
            let mut node = self.nodes.remove(&id).ok_or(format!("unknown node {id}"))?;

            // Fetch pre-transition info.
            let old_noderole = Self::format_node_role(&node);
            let (old_commit_index, _) = node.get_commit_index();
            let mut old_entries = node.scan_log()?.into_iter();

            // Apply the transition.
            node = f(node)?;

            // Fetch post-transition info.
            let nodefmt = Self::format_node(&node);
            let noderole = Self::format_node_role(&node);
            let (commit_index, commit_term) = node.get_commit_index();

            // Walk the old and new logs in lockstep, and skip the leading
            // entries whose terms match pairwise. Everything from the first
            // term mismatch (or the end of the old log) onwards is reported
            // as appended.
            let entries = node.scan_log()?.into_iter();
            let appended: Vec<Entry> = entries
                .skip_while(|e| Some(e.term) == old_entries.next().map(|e| e.term))
                .collect();

            self.nodes.insert(id, node);

            // Output relevant changes.
            if old_noderole != noderole {
                writeln!(output, "{old_noderole} ⇨ {noderole}")?
            }
            for entry in appended {
                writeln!(output, "{nodefmt} append {}", Self::format_entry(&entry))?
            }
            if old_commit_index != commit_index {
                writeln!(output, "{nodefmt} commit {commit_index}@{commit_term}")?;
            }
            // Drain the entries received on the node's applied channel.
            for entry in self.applied_rx[&id].try_iter() {
                writeln!(output, "{nodefmt} apply {}", Self::format_entry(&entry))?
            }

            // Receive any outbound messages.
            self.receive(id, output)?;
            Ok(())
        }

        /// Parses node IDs from the given argument values. Errors on key/value
        /// arguments. Can take both [Argument] and [&Argument].
        fn parse_ids<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box<dyn Error>>
        where
            A: Borrow<goldenscript::Argument>,
        {
            let mut ids = Vec::new();
            for arg in args.iter().map(|a| a.borrow()) {
                if let Some(key) = &arg.key {
                    return Err(format!("unknown argument '{key}'").into());
                }
                let id = arg.parse()?;
                if !self.nodes.contains_key(&id) {
                    return Err(format!("unknown node {id}").into());
                }
                ids.push(id)
            }
            Ok(ids)
        }

        /// Parses the given arguments as node IDs via `parse_ids`. If no
        /// arguments were given, returns the IDs of all nodes instead.
        fn parse_ids_or_all<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box<dyn Error>>
        where
            A: Borrow<goldenscript::Argument>,
        {
            let ids = self.parse_ids(args)?;
            Ok(if ids.is_empty() { self.ids.clone() } else { ids })
        }

        /// Parses the given arguments as node IDs via `parse_ids`. Errors if
        /// no arguments were given.
        fn parse_ids_or_error<A>(&self, args: &[A]) -> Result<Vec<NodeID>, Box<dyn Error>>
        where
            A: Borrow<goldenscript::Argument>,
        {
            match self.parse_ids(args)? {
                ids if ids.is_empty() => Err("node ID not given".into()),
                ids => Ok(ids),
            }
        }

        /// Formats network partitions.
        ///
        /// The input maps each node to the set of peers it is disconnected
        /// from (`receive` drops messages from the key node to these peers).
        /// A fully connected cluster yields a single line listing all nodes
        /// followed by "fully connected". Otherwise, the output has one line
        /// per partition group, with the nodes on either side sorted by ID:
        ///
        /// * `LHS ⇹ RHS` for symmetric partitions, where the disconnect is
        ///   registered in both directions.
        /// * `LHS ⇥ RHS` for asymmetric partitions, where the disconnect is
        ///   only registered for the LHS nodes.
        ///
        /// Panics if a peer in any of the sets doesn't have an entry of its
        /// own in the map.
        fn format_disconnected(disconnected: &HashMap<NodeID, HashSet<NodeID>>) -> String {
            // Return early if the cluster is fully connected.
            if disconnected.iter().all(|(_, peers)| peers.is_empty()) {
                return format!(
                    "{} fully connected\n",
                    disconnected.keys().sorted().map(|id| format!("n{id}")).join(" ")
                );
            }

            let mut output = String::new();

            // Separate symmetric and asymmetric partitions. A disconnect is
            // symmetric if the peer has also registered a disconnect from
            // this node.
            let mut symmetric: HashMap<NodeID, HashSet<NodeID>> = HashMap::new();
            let mut asymmetric: HashMap<NodeID, HashSet<NodeID>> = HashMap::new();
            for (id, peers) in disconnected {
                for peer in peers {
                    if disconnected[peer].contains(id) {
                        symmetric.entry(*id).or_default().insert(*peer);
                    } else {
                        asymmetric.entry(*id).or_default().insert(*peer);
                    }
                }
            }

            // Anchor the symmetric partitions at the node with the largest number
            // of disconnects, otherwise the smallest (first) ID.
            //
            // This iterates over a snapshot of the map, since entries are
            // removed from the map itself along the way.
            for (id, peers) in &symmetric.clone() {
                for peer in peers {
                    // Recompute the peer set sizes for each iteration, since we
                    // modify the peer set below.
                    let len = symmetric.get(id).map(|p| p.len()).unwrap_or(0);
                    let peer_len = symmetric.get(peer).map(|p| p.len()).unwrap_or(0);
                    // If this peer set is the smallest (or we're the higher ID),
                    // remove the entry. We may no longer be in the map.
                    if (len < peer_len || len == peer_len && id > peer)
                        && let Some(peers) = symmetric.get_mut(id)
                    {
                        peers.remove(peer);
                        if peers.is_empty() {
                            symmetric.remove(id);
                        }
                    }
                }
            }

            // The values (HashSets) correspond to the RHS of a partition. Let's
            // group the LHS of the partition as well, from smallest to largest,
            // separately for symmetric and asymmetric partitions. The vector
            // contains (LHS, RHS, symmetric) groupings for each partition.
            let mut grouped: Vec<(HashSet<NodeID>, HashSet<NodeID>, bool)> = Vec::new();
            for (id, peers, symm) in symmetric
                .into_iter()
                .map(|(i, p)| (i, p, true))
                .chain(asymmetric.into_iter().map(|(i, p)| (i, p, false)))
                // Order by node ID, with a node's symmetric entry before its
                // asymmetric entry.
                .sorted_by_key(|(id, _, symm)| (*id, !symm))
            {
                // Look for an existing LHS group with the same RHS, and insert
                // this node into it. Otherwise, create a new LHS group.
                match grouped.iter_mut().find(|(_, rhs, s)| peers == *rhs && symm == *s) {
                    Some((lhs, _, _)) => _ = lhs.insert(id),
                    None => grouped.push((HashSet::from([id]), peers, symm)),
                }
            }

            // Display the groups.
            for (lhs, rhs, symm) in grouped {
                let lhs = lhs.iter().sorted().map(|id| format!("n{id}")).join(" ");
                let sep = if symm { '⇹' } else { '⇥' };
                let rhs = rhs.iter().sorted().map(|id| format!("n{id}")).join(" ");
                writeln!(output, "{lhs} {sep} {rhs}").unwrap();
            }

            output
        }

        /// Formats a log entry as `index@term command`, where the command is
        /// decoded as a key/value command, or `None` for entries without a
        /// command.
        ///
        /// Panics if the command can't be decoded.
        fn format_entry(entry: &Entry) -> String {
            let command = entry.command.as_ref().map_or_else(
                || "None".to_string(),
                |raw| KVCommand::decode(raw).expect("invalid command").to_string(),
            );
            format!("{}@{} {command}", entry.index, entry.term)
        }

        /// Formats a message.
        fn format_message(msg: &Message) -> String {
            match msg {
                Message::Campaign { last_index, last_term } => {
                    format!("Campaign last={last_index}@{last_term}")
                }
                Message::CampaignResponse { vote } => {
                    format!("CampaignResponse vote={vote}")
                }
                Message::Heartbeat { last_index, commit_index, read_seq } => {
                    format!(
                        "Heartbeat last_index={last_index} commit_index={commit_index} read_seq={read_seq}"
                    )
                }
                Message::HeartbeatResponse { match_index, read_seq } => {
                    format!("HeartbeatResponse match_index={match_index} read_seq={read_seq}")
                }
                Message::Append { base_index, base_term, entries } => {
                    let ent = entries.iter().map(|e| format!("{}@{}", e.index, e.term)).join(" ");
                    format!("Append base={base_index}@{base_term} [{ent}]")
                }
                Message::AppendResponse { match_index, reject_index } => {
                    match (match_index, reject_index) {
                        (0, 0) => panic!("match_index and reject_index both 0"),
                        (match_index, 0) => format!("AppendResponse match_index={match_index}"),
                        (0, reject_index) => format!("AppendResponse reject_index={reject_index}"),
                        (_, _) => panic!("match_index and reject_index both set"),
                    }
                }
                Message::Read { seq } => {
                    format!("Read seq={seq}")
                }
                Message::ReadResponse { seq } => {
                    format!("ReadResponse seq={seq}")
                }
                Message::ClientRequest { id, request } => {
                    format!(
                        "ClientRequest id=0x{} {}",
                        hex::encode(id).trim_start_matches("00"),
                        match request {
                            Request::Read(v) => format!("read 0x{}", hex::encode(v)),
                            Request::Write(v) => format!("write 0x{}", hex::encode(v)),
                            Request::Status => "status".to_string(),
                        }
                    )
                }
                Message::ClientResponse { id, response } => {
                    format!(
                        "ClientResponse id=0x{} {}",
                        hex::encode(id).trim_start_matches("00"),
                        match response {
                            Ok(Response::Read(v)) => format!("read 0x{}", hex::encode(v)),
                            Ok(Response::Write(v)) => format!("write 0x{}", hex::encode(v)),
                            Ok(Response::Status(v)) => format!("status {v:?}"),
                            Err(error) => format!("Error::{error:#?}"),
                        }
                    )
                }
            }
        }

        /// Formats a node identifier as `n{id}@{term}`, using the node's
        /// current term.
        fn format_node(node: &Node) -> String {
            let (id, term) = (node.id(), node.term());
            format!("n{id}@{term}")
        }

        /// Formats a node identifier followed by the node's role: `candidate`,
        /// `leader`, or `follower(n{leader})`, where the parentheses are empty
        /// if the follower doesn't have a leader.
        fn format_node_role(node: &Node) -> String {
            let prefix = Self::format_node(node);
            match node {
                Node::Candidate(_) => format!("{prefix} candidate"),
                Node::Leader(_) => format!("{prefix} leader"),
                Node::Follower(follower) => {
                    let leader =
                        follower.role.leader.map_or_else(String::new, |id| format!("n{id}"));
                    format!("{prefix} follower({leader})")
                }
            }
        }

        /// Formats a client request: read and write requests are shown as
        /// their decoded key/value command, status requests as `status`.
        ///
        /// Panics if a command can't be decoded.
        fn format_request(request: &Request) -> String {
            let raw = match request {
                Request::Status => return "status".to_string(),
                Request::Read(raw) | Request::Write(raw) => raw,
            };
            KVCommand::decode(raw).unwrap().to_string()
        }

        /// Formats a client response: read and write responses are shown as
        /// their decoded key/value response, status responses are
        /// pretty-printed, and errors are shown in both debug and display
        /// form.
        ///
        /// Panics if a read or write response can't be decoded.
        fn format_response(response: &crate::error::Result<Response>) -> String {
            match response {
                Err(error) => format!("Error::{error:?} ({error})"),
                Ok(Response::Status(status)) => format!("{status:#?}"),
                Ok(Response::Read(raw) | Response::Write(raw)) => {
                    KVResponse::decode(raw).unwrap().to_string()
                }
            }
        }

        /// Strikes through the given string, by following every character
        /// with the Unicode combining long stroke overlay (U+0336).
        fn format_strikethrough(s: &str) -> String {
            const STROKE: char = '\u{0336}';
            let mut struck =
                String::with_capacity(s.len() + s.chars().count() * STROKE.len_utf8());
            for c in s.chars() {
                struck.push(c);
                struck.push(STROKE);
            }
            struck
        }
    }
}


================================================
FILE: src/raft/state.rs
================================================
use super::{Entry, Index};
use crate::error::Result;

/// A Raft-managed state machine. Raft itself does not care what the state
/// machine is, nor what the commands and results do -- it will simply apply
/// arbitrary binary commands sequentially from the Raft log, returning an
/// arbitrary binary result to the client.
///
/// Since commands are applied identically across all nodes, they must be
/// deterministic and yield the same state and result across all nodes too.
/// Otherwise, the nodes will diverge, such that different nodes will produce
/// different results.
///
/// Write commands (`Request::Write`) are replicated and applied on all nodes
/// via `State::apply`. The state machine must keep track of the last applied
/// index and return it via `State::get_applied_index`. Read commands
/// (`Request::Read`) are only executed on a single node via `State::read` and
/// must not make any state changes.
///
/// Implementations must be `Send`.
pub trait State: Send {
    /// Returns the last applied log index from the state machine.
    ///
    /// This must correspond to the current state of the state machine, since it
    /// determines which command to apply next. In particular, a node crash may
    /// result in partial command application or data loss, which must be
    /// handled appropriately.
    fn get_applied_index(&self) -> Index;

    /// Applies a log entry to the state machine, returning a client result.
    /// Errors are considered applied and propagated back to the client.
    ///
    /// This is executed on all nodes, so the result must be deterministic: it
    /// must yield the same state and result on all nodes, even if the command
    /// is reapplied following a node crash.
    ///
    /// Any non-deterministic apply error (e.g. an IO error) must panic and
    /// crash the node -- if it instead returns an error to the client, the
    /// command is considered applied and node states will diverge. The state
    /// machine is responsible for panicking when appropriate.
    ///
    /// The entry may contain a noop command, which is committed by Raft during
    /// leader changes. This still needs to be applied to the state machine to
    /// properly update the applied index, and should return an empty result.
    fn apply(&mut self, entry: Entry) -> Result<Vec<u8>>;

    /// Executes a read command in the state machine, returning a client result.
    /// Errors are also propagated back to the client.
    ///
    /// This is only executed on a single node, so it must not result in any
    /// state changes (i.e. it must not write).
    fn read(&self, command: Vec<u8>) -> Result<Vec<u8>>;
}

/// Test helper state machines.
#[cfg(test)]
pub mod test {
    use std::collections::BTreeMap;
    use std::fmt::Display;

    use crossbeam::channel::Sender;
    use itertools::Itertools as _;
    use serde::{Deserialize, Serialize};

    use super::*;
    use crate::encoding::{self, Value as _};

    /// A state machine decorator: delegates every call to the wrapped state
    /// machine, and sends each entry on the given channel once the inner
    /// state machine has applied it without error.
    pub struct Emit {
        inner: Box<dyn State>,
        tx: Sender<Entry>,
    }

    impl Emit {
        /// Wraps `inner`, sending applied entries to `tx`.
        pub fn new(inner: Box<dyn State>, tx: Sender<Entry>) -> Box<Self> {
            let emit = Self { inner, tx };
            Box::new(emit)
        }
    }

    impl State for Emit {
        fn get_applied_index(&self) -> Index {
            self.inner.get_applied_index()
        }

        fn apply(&mut self, entry: Entry) -> Result<Vec<u8>> {
            // The inner state machine consumes the entry, so keep a copy to emit
            // once it has been applied. Nothing is emitted if the inner apply
            // returns an error.
            let emitted = entry.clone();
            let response = self.inner.apply(entry)?;
            self.tx.send(emitted)?;
            Ok(response)
        }

        fn read(&self, command: Vec<u8>) -> Result<Vec<u8>> {
            self.inner.read(command)
        }
    }

    /// A simple in-memory string key/value store. Takes KVCommands and returns
    /// KVResponses.
    pub struct KV {
        /// The index of the last entry that was applied.
        applied_index: Index,
        /// The stored key/value pairs.
        data: BTreeMap<String, String>,
    }

    impl KV {
        /// Creates an empty store with applied index 0.
        pub fn new() -> Box<Self> {
            Box::new(Self { applied_index: 0, data: BTreeMap::new() })
        }
    }

    impl State for KV {
        fn get_applied_index(&self) -> Index {
            self.applied_index
        }

        /// Applies a write entry. `Put` stores the key/value pair and returns
        /// the entry's index, and a noop entry (no command) returns an empty
        /// result. Returns an error if the command can't be decoded.
        ///
        /// Panics if a read command (`Get` or `Scan`) is submitted as a write.
        fn apply(&mut self, entry: Entry) -> Result<Vec<u8>> {
            let command = entry.command.as_deref().map(KVCommand::decode).transpose();
            if command.is_err() {
                // Errors returned to the client are considered applied, so the
                // applied index must advance past this entry. Otherwise the
                // same entry would be handed to the state machine again.
                self.applied_index = entry.index;
            }
            let response = match command? {
                Some(KVCommand::Put { key, value }) => {
                    self.data.insert(key, value);
                    KVResponse::Put(entry.index).encode()
                }
                Some(c @ (KVCommand::Get { .. } | KVCommand::Scan)) => {
                    panic!("{c} submitted as write command")
                }
                None => Vec::new(),
            };
            self.applied_index = entry.index;
            Ok(response)
        }

        /// Executes a read command. `Get` returns the key's value (if any) and
        /// `Scan` returns all key/value pairs. Returns an error if the command
        /// can't be decoded.
        ///
        /// Panics if a write command (`Put`) is submitted as a read.
        fn read(&self, command: Vec<u8>) -> Result<Vec<u8>> {
            match KVCommand::decode(&command)? {
                KVCommand::Get { key } => {
                    Ok(KVResponse::Get(self.data.get(&key).cloned()).encode())
                }
                KVCommand::Scan => Ok(KVResponse::Scan(self.data.clone()).encode()),
                c @ KVCommand::Put { .. } => panic!("{c} submitted as read command"),
            }
        }
    }

    /// A command for the KV state machine. Each command yields the
    /// KVResponse variant of the same name.
    #[derive(Serialize, Deserialize)]
    pub enum KVCommand {
        /// Looks up the value stored for the given key.
        Get { key: String },
        /// Writes the given key/value pair, and returns the applied index.
        Put { key: String, value: String },
        /// Fetches every key/value pair.
        Scan,
    }

    impl encoding::Value for KVCommand {}

    impl Display for KVCommand {
        /// Renders the command as `get <key>`, `put <key>=<value>`, or `scan`.
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            match self {
                Self::Scan => f.write_str("scan"),
                Self::Put { key, value } => write!(f, "put {key}={value}"),
                Self::Get { key } => write!(f, "get {key}"),
            }
        }
    }

    /// The response to a KVCommand.
    #[derive(Serialize, Deserialize)]
    pub enum KVResponse {
        /// The value of the requested key, or None if the key does not exist.
        Get(Option<String>),
        /// The applied index of the Put command.
        Put(Index),
        /// All key/value pairs in the store.
        Scan(BTreeMap<String, String>),
    }

    impl encoding::Value for KVResponse {}

    impl Display for KVResponse {
        /// Renders a Get as the value or `None`, a Put as the applied index,
        /// and a Scan as comma-separated `key=value` pairs.
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            match self {
                Self::Get(value) => match value {
                    Some(value) => f.write_str(value),
                    None => f.write_str("None"),
                },
                Self::Put(applied_index) => write!(f, "{applied_index}"),
                Self::Scan(kvs) => {
                    let pairs = kvs.iter().map(|(key, value)| format!("{key}={value}")).join(",");
                    f.write_str(&pairs)
                }
            }
        }
    }

    /// A state machine that ignores all commands. It only tracks the index of
    /// the last applied entry, and returns empty results.
    pub struct Noop {
        applied_index: Index,
    }

    impl Noop {
        /// Creates a noop state machine with applied index 0.
        pub fn new() -> Box<Self> {
            let noop = Self { applied_index: 0 };
            Box::new(noop)
        }
    }

    impl State for Noop {
        fn get_applied_index(&self) -> Index {
            self.applied_index
        }

        fn apply(&mut self, entry: Entry) -> Result<Vec<u8>> {
            // The command is discarded; only the entry's index is recorded.
            let index = entry.index;
            self.applied_index = index;
            Ok(vec![])
        }

        fn read(&self, _command: Vec<u8>) -> Result<Vec<u8>> {
            Ok(vec![])
        }
    }
}


================================================
FILE: src/raft/testscripts/log/append
================================================
# Appending an entry with term 0 fails.
!append foo
---
Panic: can't append entry in term 0

# Appending to an empty log works. The term doesn't have to be 1. The entry is
# written to the engine and flushed to durable storage.
set_term 2
append foo [ops]
---
append → 1@2 "foo"
engine set raft:Entry(1) → 1@2 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x02\x01\x03foo"]
engine flush

# Appending a noop entry (no command) also works.
append [ops]
---
append → 2@2 None
engine set raft:Entry(2) → 2@2 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x02\x00"]
engine flush

# Check that the last index/term is updated (commit index isn't), and that
# the engine contains the expected data, both in logical and raw form.
status
scan
dump
---
term=2 last=2@2 commit=0@0 vote=None
1@2 "foo"
2@2 None
raft:Entry(1) → 1@2 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x02\x01\x03foo"]
raft:Entry(2) → 2@2 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x02\x00"]
raft:TermVote → term=2 vote=None ["\x01" → "\x02\x00"]

# Skipping a term then appending is allowed.
set_term 3
append command
set_term 5
append
---
append → 3@3 "command"
append → 4@5 None

# Dump the final status and data.
status
scan
dump
---
term=5 last=4@5 commit=0@0 vote=None
1@2 "foo"
2@2 None
3@3 "command"
4@5 None
raft:Entry(1) → 1@2 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x02\x01\x03foo"]
raft:Entry(2) → 2@2 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x02\x00"]
raft:Entry(3) → 3@3 "command" ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x03\x01\x07command"]
raft:Entry(4) → 4@5 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x04\x05\x00"]
raft:TermVote → term=5 vote=None ["\x01" → "\x05\x00"]


================================================
FILE: src/raft/testscripts/log/commit
================================================
# Committing fails on an empty engine.
!commit 1
---
Panic: commit index 1 does not exist

# Add some entries.
set_term 2
splice 1@1= 2@1=foo 3@2=bar
---
splice → 3@2 "bar"

# Committing entry 0 fails.
!commit 0
---
Panic: commit index 0 does not exist

# Committing entry 1 works, and updates the commit index.
#
# Show the engine operations too, and notice that the commit index isn't flushed
# to durable storage (it can be recovered from the durable quorum logs).
commit 1 [ops]
status
---
commit → 1@1 None
engine set raft:CommitIndex → 1@1 ["\x02" → "\x01\x01"]
term=2 last=3@2 commit=1@1 vote=None

# Dump the raw engine contents.
dump
---
raft:Entry(1) → 1@1 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x01\x00"]
raft:Entry(2) → 2@1 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x01\x01\x03foo"]
raft:Entry(3) → 3@2 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x02\x01\x03bar"]
raft:TermVote → term=2 vote=None ["\x01" → "\x02\x00"]
raft:CommitIndex → 1@1 ["\x02" → "\x01\x01"]

# Commits are idempotent, which doesn't incur an engine set.
commit 1 [ops]
status
---
commit → 1@1 None
term=2 last=3@2 commit=1@1 vote=None

# Commits can skip an entry.
commit 3
status
---
commit → 3@2 "bar"
term=2 last=3@2 commit=3@2 vote=None

# Commit regressions error.
!commit 2
status
---
Panic: commit index regression 3 → 2
term=2 last=3@2 commit=3@2 vote=None

# Committing non-existent indexes errors.
!commit 4
status
---
Panic: commit index 4 does not exist
term=2 last=3@2 commit=3@2 vote=None

# Dump the raw values.
dump
---
raft:Entry(1) → 1@1 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x01\x00"]
raft:Entry(2) → 2@1 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x01\x01\x03foo"]
raft:Entry(3) → 3@2 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x02\x01\x03bar"]
raft:TermVote → term=2 vote=None ["\x01" → "\x02\x00"]
raft:CommitIndex → 3@2 ["\x02" → "\x03\x02"]


================================================
FILE: src/raft/testscripts/log/get
================================================
# get returns None on an empty engine.
get 1
---
None

# Append a few entries.
set_term 1
append
append foo
set_term 2
append bar
---
append → 1@1 None
append → 2@1 "foo"
append → 3@2 "bar"

# get returns noop entries and regular entries.
get 1 2
---
1@1 None
2@1 "foo"

# get returns None for missing entries, and for index 0.
get 4 0
---
None
None


================================================
FILE: src/raft/testscripts/log/has
================================================
# has returns false on an empty engine.
has 1@1
---
false

# Append a few entries.
set_term 1
append
append foo
set_term 2
append bar
---
append → 1@1 None
append → 2@1 "foo"
append → 3@2 "bar"

# has returns true both for noop entries and regular entries.
has 1@1 2@1
---
true
true

# has returns false for missing entries, including index 0.
has 4@2 0@0
---
false
false

# has returns false for term mismatches.
has 1@2 3@1 0@1
---
false
false
false


================================================
FILE: src/raft/testscripts/log/init
================================================
# Tests that the log correctly initializes cached state when opened.

set_term 1
---
ok

append foo
set_term 2 7
append bar
commit 1
---
append → 1@1 "foo"
append → 2@2 "bar"
commit → 1@1 "foo"

status
---
term=2 last=2@2 commit=1@1 vote=7

reload
---
ok

status
---
term=2 last=2@2 commit=1@1 vote=7

scan
---
1@1 "foo"
2@2 "bar"


================================================
FILE: src/raft/testscripts/log/scan
================================================
# scan works on an empty engine, even when given indexes.
scan
scan 3..7
---
ok

# Append a few entries.
set_term 1
append
append foo
set_term 2
append bar
---
append → 1@1 None
append → 2@1 "foo"
append → 3@2 "bar"

# Full scan.
scan
---
1@1 None
2@1 "foo"
3@2 "bar"

# Start bound.
scan 2..
---
2@1 "foo"
3@2 "bar"

scan 4..
---
ok

scan 0..
---
1@1 None
2@1 "foo"
3@2 "bar"

# End bound.
scan "..2"
---
1@1 None

scan "..=2"
---
1@1 None
2@1 "foo"

scan "..7"
---
1@1 None
2@1 "foo"
3@2 "bar"

scan "..1"
---
ok

scan "..0"
---
ok

# Both bounds.
scan 1..2
---
1@1 None

scan "1..=2"
---
1@1 None
2@1 "foo"

scan 0..7
---
1@1 None
2@1 "foo"
3@2 "bar"

scan 1..1
---
ok

# Bounds panics.
!scan 1..0
---
Panic: range start is greater than range end in BTreeMap

!scan 7..3
---
Panic: range start is greater than range end in BTreeMap


================================================
FILE: src/raft/testscripts/log/scan_apply
================================================
# scan_apply works on an empty engine, even when given an applied index.
scan_apply 0
scan_apply 3
---
ok

# Append a few entries.
set_term 1
append
append foo
set_term 2
append bar
---
append → 1@1 None
append → 2@1 "foo"
append → 3@2 "bar"

# Nothing is committed, so scan_apply yields nothing.
scan_apply 0
---
ok

# Commit the first two entries and apply them.
commit 2
scan_apply 0
---
commit → 2@1 "foo"
1@1 None
2@1 "foo"

# Passing the commit index yields nothing.
scan_apply 2
---
ok

# Passing an applied_index after the commit index is ok, and yields nothing.
scan_apply 3
scan_apply 10
---
ok

# Committing and applying the last entry works.
commit 3
scan_apply 2
---
commit → 3@2 "bar"
3@2 "bar"

# Scanning from a lower commit index again works.
scan_apply 1
---
2@1 "foo"
3@2 "bar"

scan_apply 0
---
1@1 None
2@1 "foo"
3@2 "bar"


================================================
FILE: src/raft/testscripts/log/splice
================================================
# Splicing at index 0 should fail.
!splice 0@1=foo
---
Panic: spliced entry has index or term 0

# Splicing without a term should fail.
!splice 1@1=foo
---
Panic: splice term 1 beyond current 0


# Splicing at index 2 should fail (creates gap).
set_term 1
!splice 2@1=foo
---
Panic: first index 2 must touch existing log

# Splicing entries at start should work, both with and without commands, and
# starting at a term after 1. They should be written to the engine and flushed
# to durable storage. It should also update the state.
set_term 2
splice 1@2= 2@2=command [ops]
status
scan
---
splice → 2@2 "command"
engine set raft:Entry(1) → 1@2 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x02\x00"]
engine set raft:Entry(2) → 2@2 "command" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x02\x01\x07command"]
engine flush
term=2 last=2@2 commit=0@0 vote=None
1@2 None
2@2 "command"

# Splicing an empty list should work and be a noop.
splice [ops]
status
scan
---
splice → 2@2 "command"
term=2 last=2@2 commit=0@0 vote=None
1@2 None
2@2 "command"

# Splicing multiple duplicate entries should fail.
!splice 3@2= 3@2=
---
Panic: spliced entries are not contiguous

# Splicing entries with a gap should fail.
!splice 3@2= 5@2=
---
Panic: spliced entries are not contiguous

# Splicing entries with a term regression should fail.
!splice 3@2= 4@1=
---
Panic: spliced entries have term regression

# Splicing entries with a gap from the base should fail.
!splice 4@2=
---
Panic: first index 4 must touch existing log

# Splicing with a term regression from the base should fail.
!splice 3@1=
---
Panic: splice term regression 2 → 1

# Splicing with a term beyond the current term should fail.
!splice 3@3=
!splice 3@4=
---
Panic: splice term 3 beyond current 2
Panic: splice term 4 beyond current 2

# Fully overlapping entries is a noop.
splice 1@2= 2@2=command [ops]
scan
---
splice → 2@2 "command"
1@2 None
2@2 "command"

# An overlapping prefix is a noop.
splice 1@2= [ops]
scan
---
splice → 2@2 "command"
1@2 None
2@2 "command"

# An overlapping suffix is a noop.
splice 2@2=command [ops]
scan
---
splice → 2@2 "command"
1@2 None
2@2 "command"

# Changing a command with the same term/index should fail.
!splice 2@2=foo
scan
---
Panic: command mismatch at Entry { index: 2, term: 2, command: Some([99, 111, 109, 109, 97, 110, 100]) }
1@2 None
2@2 "command"

# Appending a new entry in the same term should work, as should
# appending one in a new term.
splice 3@2=bar
set_term 3
splice 4@3=
scan
---
splice → 3@2 "bar"
splice → 4@3 None
1@2 None
2@2 "command"
3@2 "bar"
4@3 None

# Splicing with suffix overlap should work, and only write the new entries.
splice 3@2=bar 4@3= 5@3=foo 6@3=bar [ops]
scan
---
splice → 6@3 "bar"
engine set raft:Entry(5) → 5@3 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x05\x03\x01\x03foo"]
engine set raft:Entry(6) → 6@3 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x06\x03\x01\x03bar"]
engine flush
1@2 None
2@2 "command"
3@2 "bar"
4@3 None
5@3 "foo"
6@3 "bar"

# Splicing at an existing index with a new term should replace the tail.
set_term 4
splice 4@4= [ops]
status
scan
---
splice → 4@4 None
engine set raft:Entry(4) → 4@4 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x04\x04\x00"]
engine delete raft:Entry(5) ["\x00\x00\x00\x00\x00\x00\x00\x00\x05"]
engine delete raft:Entry(6) ["\x00\x00\x00\x00\x00\x00\x00\x00\x06"]
engine flush
term=4 last=4@4 commit=0@0 vote=None
1@2 None
2@2 "command"
3@2 "bar"
4@4 None

# This also holds at the start of the log.
set_term 5
splice 1@5= 2@5=foo 3@5=bar [ops]
status
scan
---
splice → 3@5 "bar"
engine set raft:Entry(1) → 1@5 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x05\x00"]
engine set raft:Entry(2) → 2@5 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x05\x01\x03foo"]
engine set raft:Entry(3) → 3@5 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x05\x01\x03bar"]
engine delete raft:Entry(4) ["\x00\x00\x00\x00\x00\x00\x00\x00\x04"]
engine flush
term=5 last=3@5 commit=0@0 vote=None
1@5 None
2@5 "foo"
3@5 "bar"

# Splicing across the commit index should work, as long as the entries match.
commit 2
splice 1@5= 2@5=foo 3@5=bar 4@5=
status
scan
---
commit → 2@5 "foo"
splice → 4@5 None
term=5 last=4@5 commit=2@5 vote=None
1@5 None
2@5 "foo"
3@5 "bar"
4@5 None

# Splicing across the commit index can replace a tail after the commit index.
set_term 9
splice 3@6= 4@6=bar
status
scan
---
splice → 4@6 "bar"
term=9 last=4@6 commit=2@5 vote=None
1@5 None
2@5 "foo"
3@6 None
4@6 "bar"

# But replacing a tail at or before the commit index should fail.
!splice 2@7=
!splice 1@7=
---
Panic: spliced entries below commit index
Panic: spliced entries below commit index

# Dump the raw data.
dump
---
raft:Entry(1) → 1@5 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x05\x00"]
raft:Entry(2) → 2@5 "foo" ["\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02\x05\x01\x03foo"]
raft:Entry(3) → 3@6 None ["\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x03\x06\x00"]
raft:Entry(4) → 4@6 "bar" ["\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x04\x06\x01\x03bar"]
raft:TermVote → term=9 vote=None ["\x01" → "\t\x00"]
raft:CommitIndex → 2@5 ["\x02" → "\x02\x05"]


================================================
FILE: src/raft/testscripts/log/status
================================================
# Status on empty engine works.
status engine=true
---
term=0 last=0@0 commit=0@0 vote=None engine=Status {
    name: "bitcask",
    keys: 0,
    size: 0,
    disk_size: 0,
    live_disk_size: 0,
}

# Write some data.
set_term 1
append
append foo
set_term 2 1
append bar
commit 2
---
append → 1@1 None
append → 2@1 "foo"
append → 3@2 "bar"
commit → 2@1 "foo"

# Status gives correct info.
status engine=true
---
term=2 last=3@2 commit=2@1 vote=1 engine=Status {
    name: "bitcask",
    keys: 5,
    size: 51,
    disk_size: 102,
    live_disk_size: 91,
}


================================================
FILE: src/raft/testscripts/log/term
================================================
# get_term works on empty engine.
get_term
---
term=0 vote=None

# Storing a 0 term errors.
!set_term 0
---
Panic: can't set term 0

# set_term stores a term and empty vote, writing it to the engine
# and flushing it to durable storage.
set_term 3 [ops]
get_term
---
engine set raft:TermVote → term=3 vote=None ["\x01" → "\x03\x00"]
engine flush
term=3 vote=None

# set_term stores a term and vote.
set_term 3 7 [ops]
get_term
---
engine set raft:TermVote → term=3 vote=7 ["\x01" → "\x03\x01\x07"]
engine flush
term=3 vote=7

# set_term is idempotent, which doesn't incur an engine write.
set_term 3 7 [ops]
get_term
---
term=3 vote=7

# Moving the term into the far future is allowed.
set_term 7
get_term
---
term=7 vote=None

# Starting a new term with a vote is allowed.
set_term 9 1
get_term
---
term=9 vote=1

# Regressing the term errors.
!set_term 8
---
Panic: term regression 9 → 8

# Clearing the vote errors.
!set_term 9
---
Panic: can't change vote

# Changing the vote errors.
!set_term 9 2
---
Panic: can't change vote

# The above errors should not have changed the term/vote.
get_term
dump
---
term=9 vote=1
raft:TermVote → term=9 vote=1 ["\x01" → "\t\x01\x01"]


================================================
FILE: src/raft/testscripts/node/append
================================================
# Can append single entries in steady state.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Propose a single write.
put 1 foo=bar
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]

status
---
n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Append it to both followers.
deliver
---
n2@1 append 2@1 put foo=bar
n2@1 → n1 AppendResponse match_index=2
n3@1 append 2@1 put foo=bar
n3@1 → n1 AppendResponse match_index=2

# The leader commits and applies the write.
stabilize
---
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put foo=bar ⇒ 2

status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=2@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/append_base_missing
================================================
# Appends with a base beyond the node's last log entry should result in a
# rejection at the index following the last entry, and the leader appending
# the tail of the log.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n3 so that it does not receive writes.
partition 3
---
n3 ⇹ n1 n2

# Replicate a couple of writes.
(put 1 a=1)
(put 1 b=2)
(put 1 c=3)
(stabilize heartbeat=true)
status
---
n1@1 leader last=4@1 commit=4@1 applied=4 progress={2:4→5 3:1→5}
n2@1 follower(n1) last=4@1 commit=4@1 applied=4
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Heal the partition, and propose another write.
heal
put 1 c=3
---
n1 n2 n3 fully connected
c1@1 → n1 ClientRequest id=0x04 write 0x0101630133
n1@1 append 5@1 put c=3
n1@1 → n2 Append base=4@1 [5@1]
n1@1 → n3 Append base=4@1 [5@1]

# The 4@1 base is beyond n3's last index 1@1, so the append is rejected.
# However, the follower returns reject_index=2 immediately after its
# last index, rather than the original base index 4.
deliver 3
---
n3@1 → n1 AppendResponse reject_index=2

# Because index 1 is already matched with the leader, it doesn't have to probe
# and simply sends the entire tail, which is accepted.
deliver 1
status 1
---
n1@1 → n3 Append base=1@1 [2@1 3@1 4@1 5@1]
n1@1 leader last=5@1 commit=4@1 applied=4 progress={2:4→6 3:1→6}

deliver 3
---
n3@1 append 2@1 put a=1
n3@1 append 3@1 put b=2
n3@1 append 4@1 put c=3
n3@1 append 5@1 put c=3
n3@1 → n1 AppendResponse match_index=5

# When n1 receives the ack, it commits and applies the write.
deliver 1
---
n1@1 commit 5@1
n1@1 apply 5@1 put c=3
n1@1 → c1 ClientResponse id=0x04 write 0x0105
c1@1 put c=3 ⇒ 5

# The progress is also updated.
status
---
n1@1 leader last=5@1 commit=5@1 applied=5 progress={2:4→6 3:5→6}
n2@1 follower(n1) last=4@1 commit=4@1 applied=4
n3@1 follower(n1) last=5@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/append_base_missing_all
================================================
# Appends to a node with an empty log should result in a rejection of index 1,
# allowing the leader to send the entire log.

cluster nodes=3
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Partition n3 so that it does not receive writes.
partition 3
---
n3 ⇹ n1 n2

# Elect n1 as leader.
(campaign 1)
(stabilize)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:0→2}
n2@1 follower(n1) last=1@1 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Replicate a couple of writes.
(put 1 a=1)
(put 1 b=2)
(put 1 c=3)
(stabilize heartbeat=true)
status
---
n1@1 leader last=4@1 commit=4@1 applied=4 progress={2:4→5 3:0→5}
n2@1 follower(n1) last=4@1 commit=4@1 applied=4
n3@0 follower() last=0@0 commit=0@0 applied=0

# Heal the partition, and propose another write.
heal
put 1 c=3
---
n1 n2 n3 fully connected
c1@1 → n1 ClientRequest id=0x04 write 0x0101630133
n1@1 append 5@1 put c=3
n1@1 → n2 Append base=4@1 [5@1]
n1@1 → n3 Append base=4@1 [5@1]

# n3 has no entries, so it rejects with reject_index=1.
deliver 3
---
n3@0 follower() ⇨ n3@1 follower(n1)
n3@1 → n1 AppendResponse reject_index=1

# This allows n1 to send the entire log, without having to probe.
deliver 1
status 1
---
n1@1 → n3 Append base=0@0 [1@1 2@1 3@1 4@1 5@1]
n1@1 leader last=5@1 commit=4@1 applied=4 progress={2:4→6 3:0→6}

deliver 3
---
n3@1 append 1@1 None
n3@1 append 2@1 put a=1
n3@1 append 3@1 put b=2
n3@1 append 4@1 put c=3
n3@1 append 5@1 put c=3
n3@1 → n1 AppendResponse match_index=5

# When n1 receives the ack, it commits and applies the write.
deliver 1
---
n1@1 commit 5@1
n1@1 apply 5@1 put c=3
n1@1 → c1 ClientResponse id=0x04 write 0x0105
c1@1 put c=3 ⇒ 5

# The progress is also updated.
status
---
n1@1 leader last=5@1 commit=5@1 applied=5 progress={2:4→6 3:5→6}
n2@1 follower(n1) last=4@1 commit=4@1 applied=4
n3@1 follower(n1) last=5@1 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/append_commit_quorum
================================================
# Append results in a leader-side commit once a quorum is reached for the
# relevant entries.

cluster nodes=6 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2 6:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1
n6@1 follower(n1) last=1@1 commit=1@1 applied=1

# Incrementally disconnect all nodes except one and then propose a write, to
# generate an increasing quorum index.

# Replicating 2 to n2 does not commit.
partition 3 4 5 6
---
n1 n2 ⇹ n3 n4 n5 n6

put 1 a=1
stabilize
---
c1@1 → n1 ClientRequest id=0x01 write 0x0101610131
n1@1 append 2@1 put a=1
n1@1 → n2 Append base=1@1 [2@1]
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n6 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n2@1 append 2@1 put a=1
n2@1 → n1 AppendResponse match_index=2

status
---
n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:2→3 3:1→3 4:1→3 5:1→3 6:1→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1
n6@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicating 2-3 to n3 does not commit.
heal
partition 2 4 5 6
---
n1 n2 n3 n4 n5 n6 fully connected
n1 n3 ⇹ n2 n4 n5 n6

put 1 b=2
stabilize
---
c1@1 → n1 ClientRequest id=0x02 write 0x0101620132
n1@1 append 3@1 put b=2
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶
n1@1 → n3 Append base=2@1 [3@1]
n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶
n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶
n1@1 ⇥ n6 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶
n3@1 → n1 AppendResponse reject_index=2
n1@1 → n3 Append base=1@1 [2@1 3@1]
n3@1 append 2@1 put a=1
n3@1 append 3@1 put b=2
n3@1 → n1 AppendResponse match_index=3

status
---
n1@1 leader last=3@1 commit=1@1 applied=1 progress={2:2→4 3:3→4 4:1→4 5:1→4 6:1→4}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=3@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1
n6@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicating 2-4 to n4 commits 2.
heal
partition 2 3 5 6
---
n1 n2 n3 n4 n5 n6 fully connected
n1 n4 ⇹ n2 n3 n5 n6

put 1 c=3
stabilize
---
c1@1 → n1 ClientRequest id=0x03 write 0x0101630133
n1@1 append 4@1 put c=3
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶
n1@1 → n4 Append base=3@1 [4@1]
n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶
n1@1 ⇥ n6 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶
n4@1 → n1 AppendResponse reject_index=2
n1@1 → n4 Append base=1@1 [2@1 3@1 4@1]
n4@1 append 2@1 put a=1
n4@1 append 3@1 put b=2
n4@1 append 4@1 put c=3
n4@1 → n1 AppendResponse match_index=4
n1@1 commit 2@1
n1@1 apply 2@1 put a=1
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put a=1 ⇒ 2

status
---
n1@1 leader last=4@1 commit=2@1 applied=2 progress={2:2→5 3:3→5 4:4→5 5:1→5 6:1→5}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=3@1 commit=1@1 applied=1
n4@1 follower(n1) last=4@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1
n6@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicating 2-5 to n5 commits 3.
heal
partition 2 3 4 6
---
n1 n2 n3 n4 n5 n6 fully connected
n1 n5 ⇹ n2 n3 n4 n6

put 1 d=4
stabilize
---
c1@1 → n1 ClientRequest id=0x04 write 0x0101640134
n1@1 append 5@1 put d=4
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶4̶@̶1̶ ̶[̶5̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶4̶@̶1̶ ̶[̶5̶@̶1̶]̶
n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶4̶@̶1̶ ̶[̶5̶@̶1̶]̶
n1@1 → n5 Append base=4@1 [5@1]
n1@1 ⇥ n6 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶4̶@̶1̶ ̶[̶5̶@̶1̶]̶
n5@1 → n1 AppendResponse reject_index=2
n1@1 → n5 Append base=1@1 [2@1 3@1 4@1 5@1]
n5@1 append 2@1 put a=1
n5@1 append 3@1 put b=2
n5@1 append 4@1 put c=3
n5@1 append 5@1 put d=4
n5@1 → n1 AppendResponse match_index=5
n1@1 commit 3@1
n1@1 apply 3@1 put b=2
n1@1 → c1 ClientResponse id=0x02 write 0x0103
c1@1 put b=2 ⇒ 3

status
---
n1@1 leader last=5@1 commit=3@1 applied=3 progress={2:2→6 3:3→6 4:4→6 5:5→6 6:1→6}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=3@1 commit=1@1 applied=1
n4@1 follower(n1) last=4@1 commit=1@1 applied=1
n5@1 follower(n1) last=5@1 commit=1@1 applied=1
n6@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicating 2-6 to n6 commits 4.
heal
partition 2 3 4 5
---
n1 n2 n3 n4 n5 n6 fully connected
n1 n6 ⇹ n2 n3 n4 n5

put 1 e=5
stabilize
---
c1@1 → n1 ClientRequest id=0x05 write 0x0101650135
n1@1 append 6@1 put e=5
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶5̶@̶1̶ ̶[̶6̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶5̶@̶1̶ ̶[̶6̶@̶1̶]̶
n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶5̶@̶1̶ ̶[̶6̶@̶1̶]̶
n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶5̶@̶1̶ ̶[̶6̶@̶1̶]̶
n1@1 → n6 Append base=5@1 [6@1]
n6@1 → n1 AppendResponse reject_index=2
n1@1 → n6 Append base=1@1 [2@1 3@1 4@1 5@1 6@1]
n6@1 append 2@1 put a=1
n6@1 append 3@1 put b=2
n6@1 append 4@1 put c=3
n6@1 append 5@1 put d=4
n6@1 append 6@1 put e=5
n6@1 → n1 AppendResponse match_index=6
n1@1 commit 4@1
n1@1 apply 4@1 put c=3
n1@1 → c1 ClientResponse id=0x03 write 0x0104
c1@1 put c=3 ⇒ 4

status
---
n1@1 leader last=6@1 commit=4@1 applied=4 progress={2:2→7 3:3→7 4:4→7 5:5→7 6:6→7}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=3@1 commit=1@1 applied=1
n4@1 follower(n1) last=4@1 commit=1@1 applied=1
n5@1 follower(n1) last=5@1 commit=1@1 applied=1
n6@1 follower(n1) last=6@1 commit=1@1 applied=1

# Healing the partition and proposing another write replicates and commits all
# entries.
heal
---
n1 n2 n3 n4 n5 n6 fully connected

put 1 f=6
stabilize
---
c1@1 → n1 ClientRequest id=0x06 write 0x0101660136
n1@1 append 7@1 put f=6
n1@1 → n2 Append base=6@1 [7@1]
n1@1 → n3 Append base=6@1 [7@1]
n1@1 → n4 Append base=6@1 [7@1]
n1@1 → n5 Append base=6@1 [7@1]
n1@1 → n6 Append base=6@1 [7@1]
n2@1 → n1 AppendResponse reject_index=3
n3@1 → n1 AppendResponse reject_index=4
n4@1 → n1 AppendResponse reject_index=5
n5@1 → n1 AppendResponse reject_index=6
n6@1 append 7@1 put f=6
n6@1 → n1 AppendResponse match_index=7
n1@1 → n2 Append base=2@1 [3@1 4@1 5@1 6@1 7@1]
n1@1 → n3 Append base=3@1 [4@1 5@1 6@1 7@1]
n1@1 → n4 Append base=4@1 [5@1 6@1 7@1]
n1@1 → n5 Append base=5@1 [6@1 7@1]
n2@1 append 3@1 put b=2
n2@1 append 4@1 put c=3
n2@1 append 5@1 put d=4
n2@1 append 6@1 put e=5
n2@1 append 7@1 put f=6
n2@1 → n1 AppendResponse match_index=7
n3@1 append 4@1 put c=3
n3@1 append 5@1 put d=4
n3@1 append 6@1 put e=5
n3@1 append 7@1 put f=6
n3@1 → n1 AppendResponse match_index=7
n4@1 append 5@1 put d=4
n4@1 append 6@1 put e=5
n4@1 append 7@1 put f=6
n4@1 → n1 AppendResponse match_index=7
n5@1 append 6@1 put e=5
n5@1 append 7@1 put f=6
n5@1 → n1 AppendResponse match_index=7
n1@1 commit 5@1
n1@1 apply 5@1 put d=4
n1@1 → c1 ClientResponse id=0x04 write 0x0105
c1@1 put d=4 ⇒ 5
n1@1 commit 7@1
n1@1 apply 6@1 put e=5
n1@1 apply 7@1 put f=6
n1@1 → c1 ClientResponse id=0x05 write 0x0106
c1@1 put e=5 ⇒ 6
n1@1 → c1 ClientResponse id=0x06 write 0x0107
c1@1 put f=6 ⇒ 7

status
---
n1@1 leader last=7@1 commit=7@1 applied=7 progress={2:7→8 3:7→8 4:7→8 5:7→8 6:7→8}
n2@1 follower(n1) last=7@1 commit=1@1 applied=1
n3@1 follower(n1) last=7@1 commit=1@1 applied=1
n4@1 follower(n1) last=7@1 commit=1@1 applied=1
n5@1 follower(n1) last=7@1 commit=1@1 applied=1
n6@1 follower(n1) last=7@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/append_initial
================================================
# An initial append at base 0 can have a single or multiple entries.

cluster nodes=3
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Partition n3 so that it has an empty log.
partition 3
---
n3 ⇹ n1 n2

# n1 campaigns.
campaign 1
deliver
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 ⇥ n3 C̶a̶m̶p̶a̶i̶g̶n̶ ̶l̶a̶s̶t̶=̶0̶@̶0̶
n2@0 follower() ⇨ n2@1 follower()
n2@1 → n1 CampaignResponse vote=true

# When n1 wins, it successfully appends an entry at base 0 to n2.
stabilize
---
n1@1 candidate ⇨ n1@1 leader
n1@1 append 1@1 None
n1@1 → n2 Append base=0@0 [1@1]
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶0̶@̶0̶ ̶[̶1̶@̶1̶]̶
n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 ⇥ n3 H̶e̶a̶r̶t̶b̶e̶a̶t̶ ̶l̶a̶s̶t̶_̶i̶n̶d̶e̶x̶=̶1̶ ̶c̶o̶m̶m̶i̶t̶_̶i̶n̶d̶e̶x̶=̶0̶ ̶r̶e̶a̶d̶_̶s̶e̶q̶=̶0̶
n2@1 follower() ⇨ n2@1 follower(n1)
n2@1 append 1@1 None
n2@1 → n1 AppendResponse match_index=1
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n1@1 commit 1@1
n1@1 apply 1@1 None

status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:0→2}
n2@1 follower(n1) last=1@1 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Heal the partition.
heal
---
n1 n2 n3 fully connected

# Propose a write. This appends entry 2 to n2 at base 1, but is rejected by n3
# which doesn't have entry 1.
put 1 foo=bar
deliver
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]
n2@1 append 2@1 put foo=bar
n2@1 → n1 AppendResponse match_index=2
n3@0 follower() ⇨ n3@1 follower(n1)
n3@1 → n1 AppendResponse reject_index=1

# Since n3 rejected base 1, n1 sends an append with all entries from base 0,
# which is accepted.
stabilize
---
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put foo=bar ⇒ 2
n1@1 → n3 Append base=0@0 [1@1 2@1]
n3@1 append 1@1 None
n3@1 append 2@1 put foo=bar
n3@1 → n1 AppendResponse match_index=2

log
---
n1@1 term=1 last=2@1 commit=2@1 vote=Some(1)
n1@1 entry 1@1 None
n1@1 entry 2@1 put foo=bar
n2@1 term=1 last=2@1 commit=0@0 vote=Some(1)
n2@1 entry 1@1 None
n2@1 entry 2@1 put foo=bar
n3@1 term=1 last=2@1 commit=0@0 vote=None
n3@1 entry 1@1 None
n3@1 entry 2@1 put foo=bar


================================================
FILE: src/raft/testscripts/node/append_max_entries
================================================
# Large appends are limited to MAX_APPEND_ENTRIES, and each successful append
# triggers the next append batch.

cluster nodes=3 leader=1 max_append_entries=2
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n3.
partition 3
---
n3 ⇹ n1 n2

# Make a bunch of writes.
(put 1 a=1)
(put 1 a=2)
(put 1 a=3)
(put 1 a=4)
(put 1 a=5)
(put 1 a=6)
(put 1 a=7)
(stabilize heartbeat=true)
status
---
n1@1 leader last=8@1 commit=8@1 applied=8 progress={2:8→9 3:1→9}
n2@1 follower(n1) last=8@1 commit=8@1 applied=8
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Heal the partition.
heal
---
n1 n2 n3 fully connected

# The next heartbeat triggers a probe.
heartbeat 1
deliver
deliver
deliver
---
n1@1 → n2 Heartbeat last_index=8 commit_index=8 read_seq=0
n1@1 → n3 Heartbeat last_index=8 commit_index=8 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=8 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n1@1 → n3 Append base=7@1 []
n3@1 → n1 AppendResponse reject_index=2

# When the leader receives the probe response, it begins appending in batches of
# max_append_entries until the follower is caught up.
stabilize
---
n1@1 → n3 Append base=1@1 [2@1 3@1]
n3@1 append 2@1 put a=1
n3@1 append 3@1 put a=2
n3@1 → n1 AppendResponse match_index=3
n1@1 → n3 Append base=3@1 [4@1 5@1]
n3@1 append 4@1 put a=3
n3@1 append 5@1 put a=4
n3@1 → n1 AppendResponse match_index=5
n1@1 → n3 Append base=5@1 [6@1 7@1]
n3@1 append 6@1 put a=5
n3@1 append 7@1 put a=6
n3@1 → n1 AppendResponse match_index=7
n1@1 → n3 Append base=7@1 [8@1]
n3@1 append 8@1 put a=7
n3@1 → n1 AppendResponse match_index=8


================================================
FILE: src/raft/testscripts/node/append_pipeline
================================================
# Multiple appends are pipelined before acks are received, without
# retransmitting the unacked entries.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Propose a single write. The progress next index increases to 3.
put 1 a=1
---
c1@1 → n1 ClientRequest id=0x01 write 0x0101610131
n1@1 append 2@1 put a=1
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]

status
---
n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Propose two more writes. Appends are sent without past duplicates.
put 1 b=2
put 1 c=3
---
c1@1 → n1 ClientRequest id=0x02 write 0x0101620132
n1@1 append 3@1 put b=2
n1@1 → n2 Append base=2@1 [3@1]
n1@1 → n3 Append base=2@1 [3@1]
c1@1 → n1 ClientRequest id=0x03 write 0x0101630133
n1@1 append 4@1 put c=3
n1@1 → n2 Append base=3@1 [4@1]
n1@1 → n3 Append base=3@1 [4@1]

status
---
n1@1 leader last=4@1 commit=1@1 applied=1 progress={2:1→5 3:1→5}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# The appends are received and acked sequentially.
deliver
---
n2@1 append 2@1 put a=1
n2@1 → n1 AppendResponse match_index=2
n2@1 append 3@1 put b=2
n2@1 → n1 AppendResponse match_index=3
n2@1 append 4@1 put c=3
n2@1 → n1 AppendResponse match_index=4
n3@1 append 2@1 put a=1
n3@1 → n1 AppendResponse match_index=2
n3@1 append 3@1 put b=2
n3@1 → n1 AppendResponse match_index=3
n3@1 append 4@1 put c=3
n3@1 → n1 AppendResponse match_index=4

# The leader receives the acks and commits the writes one by one,
# without retransmitting the in-flight (to it) entries.
deliver
---
n1@1 commit 2@1
n1@1 apply 2@1 put a=1
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put a=1 ⇒ 2
n1@1 commit 3@1
n1@1 apply 3@1 put b=2
n1@1 → c1 ClientResponse id=0x02 write 0x0103
c1@1 put b=2 ⇒ 3
n1@1 commit 4@1
n1@1 apply 4@1 put c=3
n1@1 → c1 ClientResponse id=0x03 write 0x0104
c1@1 put c=3 ⇒ 4

# All nodes are now caught up on logs (but not commit/apply, which needs a
# heartbeat).
status
---
n1@1 leader last=4@1 commit=4@1 applied=4 progress={2:4→5 3:4→5}
n2@1 follower(n1) last=4@1 commit=1@1 applied=1
n3@1 follower(n1) last=4@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/append_probe_divergent_first
================================================
# Appending to a previous leader and follower whose tails diverge all the way
# back to the first entry works.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n1-n2
partition 1 2
---
n1 n2 ⇹ n3 n4 n5

# Elect new leaders in the majority partition and replicate a few writes.
# Having multiple leaders ensures the log has multiple terms.
(campaign 3)
(stabilize)
(put 3 a=1)
(stabilize heartbeat=true)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@2 leader last=3@2 commit=3@2 applied=3 progress={1:0→4 2:0→4 4:3→4 5:3→4}
n4@2 follower(n3) last=3@2 commit=3@2 applied=3
n5@2 follower(n3) last=3@2 commit=3@2 applied=3

(campaign 4)
(stabilize)
(put 4 b=2)
(stabilize heartbeat=true)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@3 follower(n4) last=5@3 commit=5@3 applied=5
n4@3 leader last=5@3 commit=5@3 applied=5 progress={1:0→6 2:0→6 3:5→6 5:5→6}
n5@3 follower(n4) last=5@3 commit=5@3 applied=5

(campaign 5)
(stabilize)
(put 5 c=3)
(stabilize heartbeat=true)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@4 follower(n5) last=7@4 commit=7@4 applied=7
n4@4 follower(n5) last=7@4 commit=7@4 applied=7
n5@4 leader last=7@4 commit=7@4 applied=7 progress={1:0→8 2:0→8 3:7→8 4:7→8}

# Propose writes in the minority partition as well.
(put 1 a=2)
(put 1 a=3)
(put 1 a=4)
(put 1 a=5)
(put 1 a=6)
(put 1 a=7)
(stabilize)
status
---
n1@1 leader last=7@1 commit=1@1 applied=1 progress={2:7→8 3:1→8 4:1→8 5:1→8}
n2@1 follower(n1) last=7@1 commit=1@1 applied=1
n3@4 follower(n5) last=7@4 commit=7@4 applied=7
n4@4 follower(n5) last=7@4 commit=7@4 applied=7
n5@4 leader last=7@4 commit=7@4 applied=7 progress={1:0→8 2:0→8 3:7→8 4:7→8}

log 1 5
---
n1@1 term=1 last=7@1 commit=1@1 vote=Some(1)
n1@1 entry 1@1 None
n1@1 entry 2@1 put a=2
n1@1 entry 3@1 put a=3
n1@1 entry 4@1 put a=4
n1@1 entry 5@1 put a=5
n1@1 entry 6@1 put a=6
n1@1 entry 7@1 put a=7
n5@4 term=4 last=7@4 commit=7@4 vote=Some(5)
n5@4 entry 1@1 None
n5@4 entry 2@2 None
n5@4 entry 3@2 put a=1
n5@4 entry 4@3 None
n5@4 entry 5@3 put b=2
n5@4 entry 6@4 None
n5@4 entry 7@4 put c=3

# Heal the partition.
heal
---
n1 n2 n3 n4 n5 fully connected

# Propose another write on the majority leader.
put 5 d=4
---
c5@4 → n5 ClientRequest id=0x0a write 0x0101640134
n5@4 append 8@4 put d=4
n5@4 → n1 Append base=7@4 [8@4]
n5@4 → n2 Append base=7@4 [8@4]
n5@4 → n3 Append base=7@4 [8@4]
n5@4 → n4 Append base=7@4 [8@4]

# Delivering the appends to n1 and n2 should reject them. It also cancels the
# in-flight write requests on n1.
deliver 1 2
---
n1@1 leader ⇨ n1@4 follower(n5)
n1@1 → c1 ClientResponse id=0x04 Error::Abort
c1@1 put a=2 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x05 Error::Abort
c1@1 put a=3 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x06 Error::Abort
c1@1 put a=4 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x07 Error::Abort
c1@1 put a=5 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x08 Error::Abort
c1@1 put a=6 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x09 Error::Abort
c1@1 put a=7 ⇒ Error::Abort (operation aborted)
n1@4 → n5 AppendResponse reject_index=7
n2@1 follower(n1) ⇨ n2@4 follower(n5)
n2@4 → n5 AppendResponse reject_index=7

# n5 will probe the previous base, which is again rejected. This repeats until
# a common base is found at 1@1.
deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=6@4 []
n5@4 → n2 Append base=6@4 []
n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→7 2:0→7 3:7→9 4:7→9}
n1@4 → n5 AppendResponse reject_index=6
n2@4 → n5 AppendResponse reject_index=6

deliver 5
deliver 1 2
status 5
---
n5@4 → n1 Append base=5@3 []
n5@4 → n2 Append base=5@3 []
n1@4 → n5 AppendResponse reject_index=5
n2@4 → n5 AppendResponse reject_index=5
n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→6 2:0→6 3:7→9 4:7→9}

deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=4@3 []
n5@4 → n2 Append base=4@3 []
n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→5 2:0→5 3:7→9 4:7→9}
n1@4 → n5 AppendResponse reject_index=4
n2@4 → n5 AppendResponse reject_index=4

deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=3@2 []
n5@4 → n2 Append base=3@2 []
n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→4 2:0→4 3:7→9 4:7→9}
n1@4 → n5 AppendResponse reject_index=3
n2@4 → n5 AppendResponse reject_index=3

deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=2@2 []
n5@4 → n2 Append base=2@2 []
n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→3 2:0→3 3:7→9 4:7→9}
n1@4 → n5 AppendResponse reject_index=2
n2@4 → n5 AppendResponse reject_index=2

deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=1@1 []
n5@4 → n2 Append base=1@1 []
n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:0→2 2:0→2 3:7→9 4:7→9}
n1@4 → n5 AppendResponse match_index=1
n2@4 → n5 AppendResponse match_index=1

# n5 can now replicate the tail to n1 and n2, allowing n5 to commit it.
deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=1@1 [2@2 3@2 4@3 5@3 6@4 7@4 8@4]
n5@4 → n2 Append base=1@1 [2@2 3@2 4@3 5@3 6@4 7@4 8@4]
n5@4 leader last=8@4 commit=7@4 applied=7 progress={1:1→9 2:1→9 3:7→9 4:7→9}
n1@4 append 2@2 None
n1@4 append 3@2 put a=1
n1@4 append 4@3 None
n1@4 append 5@3 put b=2
n1@4 append 6@4 None
n1@4 append 7@4 put c=3
n1@4 append 8@4 put d=4
n1@4 → n5 AppendResponse match_index=8
n2@4 append 2@2 None
n2@4 append 3@2 put a=1
n2@4 append 4@3 None
n2@4 append 5@3 put b=2
n2@4 append 6@4 None
n2@4 append 7@4 put c=3
n2@4 append 8@4 put d=4
n2@4 → n5 AppendResponse match_index=8

deliver 5
---
n5@4 commit 8@4
n5@4 apply 8@4 put d=4
n5@4 → c5 ClientResponse id=0x0a write 0x0108
c5@4 put d=4 ⇒ 8

status
---
n1@4 follower(n5) last=8@4 commit=1@1 applied=1
n2@4 follower(n5) last=8@4 commit=1@1 applied=1
n3@4 follower(n5) last=7@4 commit=7@4 applied=7
n4@4 follower(n5) last=7@4 commit=7@4 applied=7
n5@4 leader last=8@4 commit=8@4 applied=8 progress={1:8→9 2:8→9 3:7→9 4:7→9}

# Stabilize the cluster.
(stabilize heartbeat=true)
status
---
n1@4 follower(n5) last=8@4 commit=8@4 applied=8
n2@4 follower(n5) last=8@4 commit=8@4 applied=8
n3@4 follower(n5) last=8@4 commit=8@4 applied=8
n4@4 follower(n5) last=8@4 commit=8@4 applied=8
n5@4 leader last=8@4 commit=8@4 applied=8 progress={1:8→9 2:8→9 3:8→9 4:8→9}


================================================
FILE: src/raft/testscripts/node/append_probe_divergent_long
================================================
# Appending to a previous leader and follower with a long divergent tail
# requires the leader to repeatedly probe until it finds a common base.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Make a couple of writes to ensure a common log prefix.
(put 1 a=1)
(put 1 b=2)
(stabilize)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@1 follower(n1) last=3@1 commit=1@1 applied=1
n4@1 follower(n1) last=3@1 commit=1@1 applied=1
n5@1 follower(n1) last=3@1 commit=1@1 applied=1

# Partition n1-n2
partition 1 2
---
n1 n2 ⇹ n3 n4 n5

# Elect new leaders in the majority partition and replicate a few writes.
# Having multiple leaders ensures the log has multiple terms.
(campaign 3)
(stabilize)
(put 3 c=3)
(stabilize heartbeat=true)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@2 leader last=5@2 commit=5@2 applied=5 progress={1:0→6 2:0→6 4:5→6 5:5→6}
n4@2 follower(n3) last=5@2 commit=5@2 applied=5
n5@2 follower(n3) last=5@2 commit=5@2 applied=5

(campaign 4)
(stabilize)
(put 4 d=4)
(stabilize heartbeat=true)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@3 follower(n4) last=7@3 commit=7@3 applied=7
n4@3 leader last=7@3 commit=7@3 applied=7 progress={1:0→8 2:0→8 3:7→8 5:7→8}
n5@3 follower(n4) last=7@3 commit=7@3 applied=7

(campaign 5)
(stabilize)
(put 5 e=5)
(stabilize heartbeat=true)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@4 follower(n5) last=9@4 commit=9@4 applied=9
n4@4 follower(n5) last=9@4 commit=9@4 applied=9
n5@4 leader last=9@4 commit=9@4 applied=9 progress={1:0→10 2:0→10 3:9→10 4:9→10}

# Propose writes in the minority partition as well, to build up a log
# longer than the majority log.
(put 1 a=2)
(put 1 a=3)
(put 1 a=4)
(put 1 a=5)
(put 1 a=6)
(put 1 a=7)
(put 1 a=8)
(put 1 a=9)
(put 1 a=10)
(stabilize)
status
---
n1@1 leader last=12@1 commit=3@1 applied=3 progress={2:12→13 3:3→13 4:3→13 5:3→13}
n2@1 follower(n1) last=12@1 commit=1@1 applied=1
n3@4 follower(n5) last=9@4 commit=9@4 applied=9
n4@4 follower(n5) last=9@4 commit=9@4 applied=9
n5@4 leader last=9@4 commit=9@4 applied=9 progress={1:0→10 2:0→10 3:9→10 4:9→10}

log 1 5
---
n1@1 term=1 last=12@1 commit=3@1 vote=Some(1)
n1@1 entry 1@1 None
n1@1 entry 2@1 put a=1
n1@1 entry 3@1 put b=2
n1@1 entry 4@1 put a=2
n1@1 entry 5@1 put a=3
n1@1 entry 6@1 put a=4
n1@1 entry 7@1 put a=5
n1@1 entry 8@1 put a=6
n1@1 entry 9@1 put a=7
n1@1 entry 10@1 put a=8
n1@1 entry 11@1 put a=9
n1@1 entry 12@1 put a=10
n5@4 term=4 last=9@4 commit=9@4 vote=Some(5)
n5@4 entry 1@1 None
n5@4 entry 2@1 put a=1
n5@4 entry 3@1 put b=2
n5@4 entry 4@2 None
n5@4 entry 5@2 put c=3
n5@4 entry 6@3 None
n5@4 entry 7@3 put d=4
n5@4 entry 8@4 None
n5@4 entry 9@4 put e=5

# Heal the partition.
heal
---
n1 n2 n3 n4 n5 fully connected

# Propose another write on the majority leader.
put 5 f=6
---
c5@4 → n5 ClientRequest id=0x0f write 0x0101660136
n5@4 append 10@4 put f=6
n5@4 → n1 Append base=9@4 [10@4]
n5@4 → n2 Append base=9@4 [10@4]
n5@4 → n3 Append base=9@4 [10@4]
n5@4 → n4 Append base=9@4 [10@4]

# Delivering the appends to n1 and n2 should reject them. It also cancels the
# in-flight write requests on n1.
deliver 1 2
---
n1@1 leader ⇨ n1@4 follower(n5)
n1@1 → c1 ClientResponse id=0x06 Error::Abort
c1@1 put a=2 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x07 Error::Abort
c1@1 put a=3 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x08 Error::Abort
c1@1 put a=4 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x09 Error::Abort
c1@1 put a=5 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0a Error::Abort
c1@1 put a=6 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0b Error::Abort
c1@1 put a=7 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0c Error::Abort
c1@1 put a=8 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0d Error::Abort
c1@1 put a=9 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0e Error::Abort
c1@1 put a=10 ⇒ Error::Abort (operation aborted)
n1@4 → n5 AppendResponse reject_index=9
n2@1 follower(n1) ⇨ n2@4 follower(n5)
n2@4 → n5 AppendResponse reject_index=9

# n5 will probe the previous base, which is again rejected. This repeats until
# a common base is found at 3@1.
deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=8@4 []
n5@4 → n2 Append base=8@4 []
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→9 2:0→9 3:9→11 4:9→11}
n1@4 → n5 AppendResponse reject_index=8
n2@4 → n5 AppendResponse reject_index=8

deliver 5
deliver 1 2
status 5
---
n5@4 → n1 Append base=7@3 []
n5@4 → n2 Append base=7@3 []
n1@4 → n5 AppendResponse reject_index=7
n2@4 → n5 AppendResponse reject_index=7
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→8 2:0→8 3:9→11 4:9→11}

deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=6@3 []
n5@4 → n2 Append base=6@3 []
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→7 2:0→7 3:9→11 4:9→11}
n1@4 → n5 AppendResponse reject_index=6
n2@4 → n5 AppendResponse reject_index=6

deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=5@2 []
n5@4 → n2 Append base=5@2 []
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→6 2:0→6 3:9→11 4:9→11}
n1@4 → n5 AppendResponse reject_index=5
n2@4 → n5 AppendResponse reject_index=5

deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=4@2 []
n5@4 → n2 Append base=4@2 []
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→5 2:0→5 3:9→11 4:9→11}
n1@4 → n5 AppendResponse reject_index=4
n2@4 → n5 AppendResponse reject_index=4

deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=3@1 []
n5@4 → n2 Append base=3@1 []
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→4 2:0→4 3:9→11 4:9→11}
n1@4 → n5 AppendResponse match_index=3
n2@4 → n5 AppendResponse match_index=3

# n5 can now replicate the tail to n1 and n2, allowing n5 to commit it.
deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=3@1 [4@2 5@2 6@3 7@3 8@4 9@4 10@4]
n5@4 → n2 Append base=3@1 [4@2 5@2 6@3 7@3 8@4 9@4 10@4]
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:3→11 2:3→11 3:9→11 4:9→11}
n1@4 append 4@2 None
n1@4 append 5@2 put c=3
n1@4 append 6@3 None
n1@4 append 7@3 put d=4
n1@4 append 8@4 None
n1@4 append 9@4 put e=5
n1@4 append 10@4 put f=6
n1@4 → n5 AppendResponse match_index=10
n2@4 append 4@2 None
n2@4 append 5@2 put c=3
n2@4 append 6@3 None
n2@4 append 7@3 put d=4
n2@4 append 8@4 None
n2@4 append 9@4 put e=5
n2@4 append 10@4 put f=6
n2@4 → n5 AppendResponse match_index=10

deliver 5
---
n5@4 commit 10@4
n5@4 apply 10@4 put f=6
n5@4 → c5 ClientResponse id=0x0f write 0x010a
c5@4 put f=6 ⇒ 10

status
---
n1@4 follower(n5) last=10@4 commit=3@1 applied=3
n2@4 follower(n5) last=10@4 commit=1@1 applied=1
n3@4 follower(n5) last=9@4 commit=9@4 applied=9
n4@4 follower(n5) last=9@4 commit=9@4 applied=9
n5@4 leader last=10@4 commit=10@4 applied=10 progress={1:10→11 2:10→11 3:9→11 4:9→11}

# Stabilize the cluster.
(stabilize heartbeat=true)
status
---
n1@4 follower(n5) last=10@4 commit=10@4 applied=10
n2@4 follower(n5) last=10@4 commit=10@4 applied=10
n3@4 follower(n5) last=10@4 commit=10@4 applied=10
n4@4 follower(n5) last=10@4 commit=10@4 applied=10
n5@4 leader last=10@4 commit=10@4 applied=10 progress={1:10→11 2:10→11 3:10→11 4:10→11}


================================================
FILE: src/raft/testscripts/node/append_probe_divergent_short
================================================
# Appending to a previous leader and follower with a shorter divergent tail
# skips the missing entries before probing.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Make a couple of writes to ensure a common log prefix.
(put 1 a=1)
(put 1 b=2)
(stabilize)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@1 follower(n1) last=3@1 commit=1@1 applied=1
n4@1 follower(n1) last=3@1 commit=1@1 applied=1
n5@1 follower(n1) last=3@1 commit=1@1 applied=1

# Partition n1-n2
partition 1 2
---
n1 n2 ⇹ n3 n4 n5

# Elect new leaders in the majority partition and replicate a few writes.
# Having multiple leaders ensures the log has multiple terms.
(campaign 3)
(stabilize)
(put 3 c=3)
(stabilize heartbeat=true)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@2 leader last=5@2 commit=5@2 applied=5 progress={1:0→6 2:0→6 4:5→6 5:5→6}
n4@2 follower(n3) last=5@2 commit=5@2 applied=5
n5@2 follower(n3) last=5@2 commit=5@2 applied=5

(campaign 4)
(stabilize)
(put 4 d=4)
(stabilize heartbeat=true)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@3 follower(n4) last=7@3 commit=7@3 applied=7
n4@3 leader last=7@3 commit=7@3 applied=7 progress={1:0→8 2:0→8 3:7→8 5:7→8}
n5@3 follower(n4) last=7@3 commit=7@3 applied=7

(campaign 5)
(stabilize)
(put 5 e=5)
(stabilize heartbeat=true)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4 4:3→4 5:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@4 follower(n5) last=9@4 commit=9@4 applied=9
n4@4 follower(n5) last=9@4 commit=9@4 applied=9
n5@4 leader last=9@4 commit=9@4 applied=9 progress={1:0→10 2:0→10 3:9→10 4:9→10}

# Propose a single write in the minority partition. The divergent minority log
# is much shorter than the majority log.
(put 1 a=2)
(stabilize)
status
---
n1@1 leader last=4@1 commit=3@1 applied=3 progress={2:4→5 3:3→5 4:3→5 5:3→5}
n2@1 follower(n1) last=4@1 commit=1@1 applied=1
n3@4 follower(n5) last=9@4 commit=9@4 applied=9
n4@4 follower(n5) last=9@4 commit=9@4 applied=9
n5@4 leader last=9@4 commit=9@4 applied=9 progress={1:0→10 2:0→10 3:9→10 4:9→10}

log 1 5
---
n1@1 term=1 last=4@1 commit=3@1 vote=Some(1)
n1@1 entry 1@1 None
n1@1 entry 2@1 put a=1
n1@1 entry 3@1 put b=2
n1@1 entry 4@1 put a=2
n5@4 term=4 last=9@4 commit=9@4 vote=Some(5)
n5@4 entry 1@1 None
n5@4 entry 2@1 put a=1
n5@4 entry 3@1 put b=2
n5@4 entry 4@2 None
n5@4 entry 5@2 put c=3
n5@4 entry 6@3 None
n5@4 entry 7@3 put d=4
n5@4 entry 8@4 None
n5@4 entry 9@4 put e=5

# Heal the partition.
heal
---
n1 n2 n3 n4 n5 fully connected

# Propose another write on the majority leader.
put 5 f=6
---
c5@4 → n5 ClientRequest id=0x07 write 0x0101660136
n5@4 append 10@4 put f=6
n5@4 → n1 Append base=9@4 [10@4]
n5@4 → n2 Append base=9@4 [10@4]
n5@4 → n3 Append base=9@4 [10@4]
n5@4 → n4 Append base=9@4 [10@4]

# Delivering the appends to n1 and n2 should reject them, but with a
# reject_index=5 after their last index instead of the original base 9. It also
# cancels the in-flight write requests on n1.
deliver 1 2
---
n1@1 leader ⇨ n1@4 follower(n5)
n1@1 → c1 ClientResponse id=0x06 Error::Abort
c1@1 put a=2 ⇒ Error::Abort (operation aborted)
n1@4 → n5 AppendResponse reject_index=5
n2@1 follower(n1) ⇨ n2@4 follower(n5)
n2@4 → n5 AppendResponse reject_index=5

# n5 will probe the previous base, which is again rejected. This repeats until
# a common base is found at 3@1.
deliver 5
status 5
deliver 1 2
---
n5@4 → n1 Append base=4@2 []
n5@4 → n2 Append base=4@2 []
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→5 2:0→5 3:9→11 4:9→11}
n1@4 → n5 AppendResponse reject_index=4
n2@4 → n5 AppendResponse reject_index=4

deliver 5
deliver 1 2
status 5
---
n5@4 → n1 Append base=3@1 []
n5@4 → n2 Append base=3@1 []
n1@4 → n5 AppendResponse match_index=3
n2@4 → n5 AppendResponse match_index=3
n5@4 leader last=10@4 commit=9@4 applied=9 progress={1:0→4 2:0→4 3:9→11 4:9→11}

# n5 can now replicate the tail to n1 and n2, allowing n5 to commit it.
deliver 5
deliver 1 2
---
n5@4 → n1 Append base=3@1 [4@2 5@2 6@3 7@3 8@4 9@4 10@4]
n5@4 → n2 Append base=3@1 [4@2 5@2 6@3 7@3 8@4 9@4 10@4]
n1@4 append 4@2 None
n1@4 append 5@2 put c=3
n1@4 append 6@3 None
n1@4 append 7@3 put d=4
n1@4 append 8@4 None
n1@4 append 9@4 put e=5
n1@4 append 10@4 put f=6
n1@4 → n5 AppendResponse match_index=10
n2@4 append 4@2 None
n2@4 append 5@2 put c=3
n2@4 append 6@3 None
n2@4 append 7@3 put d=4
n2@4 append 8@4 None
n2@4 append 9@4 put e=5
n2@4 append 10@4 put f=6
n2@4 → n5 AppendResponse match_index=10

deliver 5
---
n5@4 commit 10@4
n5@4 apply 10@4 put f=6
n5@4 → c5 ClientResponse id=0x07 write 0x010a
c5@4 put f=6 ⇒ 10

status
---
n1@4 follower(n5) last=10@4 commit=3@1 applied=3
n2@4 follower(n5) last=10@4 commit=1@1 applied=1
n3@4 follower(n5) last=9@4 commit=9@4 applied=9
n4@4 follower(n5) last=9@4 commit=9@4 applied=9
n5@4 leader last=10@4 commit=10@4 applied=10 progress={1:10→11 2:10→11 3:9→11 4:9→11}

# Stabilize the cluster.
(stabilize heartbeat=true)
status
---
n1@4 follower(n5) last=10@4 commit=10@4 applied=10
n2@4 follower(n5) last=10@4 commit=10@4 applied=10
n3@4 follower(n5) last=10@4 commit=10@4 applied=10
n4@4 follower(n5) last=10@4 commit=10@4 applied=10
n5@4 leader last=10@4 commit=10@4 applied=10 progress={1:10→11 2:10→11 3:10→11 4:10→11}


================================================
FILE: src/raft/testscripts/node/append_probe_divergent_single
================================================
# An append replaces a conflict at the tail for a single term.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n3-n5.
partition 3 4 5
---
n1 n2 ⇹ n3 n4 n5

# Propose and replicate a write in the minority partition.
put 1 a=1
stabilize
---
c1@1 → n1 ClientRequest id=0x01 write 0x0101610131
n1@1 append 2@1 put a=1
n1@1 → n2 Append base=1@1 [2@1]
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n4 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n5 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n2@1 append 2@1 put a=1
n2@1 → n1 AppendResponse match_index=2

log 1 2
---
n1@1 term=1 last=2@1 commit=1@1 vote=Some(1)
n1@1 entry 1@1 None
n1@1 entry 2@1 put a=1
n2@1 term=1 last=2@1 commit=1@1 vote=Some(1)
n2@1 entry 1@1 None
n2@1 entry 2@1 put a=1

# Elect n5 as a new majority partition leader. It appends an empty entry.
(campaign 5)
(stabilize heartbeat=true)
status
---
n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:2→3 3:1→3 4:1→3 5:1→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@2 follower(n5) last=2@2 commit=2@2 applied=2
n4@2 follower(n5) last=2@2 commit=2@2 applied=2
n5@2 leader last=2@2 commit=2@2 applied=2 progress={1:0→3 2:0→3 3:2→3 4:2→3}

# Heal the partition and propose a new write.
heal
put 5 b=2
---
n1 n2 n3 n4 n5 fully connected
c5@2 → n5 ClientRequest id=0x02 write 0x0101620132
n5@2 append 3@2 put b=2
n5@2 → n1 Append base=2@2 [3@2]
n5@2 → n2 Append base=2@2 [3@2]
n5@2 → n3 Append base=2@2 [3@2]
n5@2 → n4 Append base=2@2 [3@2]

# Delivering the append messages to n1,n2 will make them follow n5 and
# reject the appends due to a log mismatch.
deliver 1 2
---
n1@1 leader ⇨ n1@2 follower(n5)
n1@1 → c1 ClientResponse id=0x01 Error::Abort
c1@1 put a=1 ⇒ Error::Abort (operation aborted)
n1@2 → n5 AppendResponse reject_index=2
n2@1 follower(n1) ⇨ n2@2 follower(n5)
n2@2 → n5 AppendResponse reject_index=2

# n5 probes index 1, which succeeds. n1 and n2 still have the old logs.
deliver 5
deliver 1 2
---
n5@2 → n1 Append base=1@1 []
n5@2 → n2 Append base=1@1 []
n1@2 → n5 AppendResponse match_index=1
n2@2 → n5 AppendResponse match_index=1

log 1 2
---
n1@2 term=2 last=2@1 commit=1@1 vote=None
n1@2 entry 1@1 None
n1@2 entry 2@1 put a=1
n2@2 term=2 last=2@1 commit=1@1 vote=None
n2@2 entry 1@1 None
n2@2 entry 2@1 put a=1

# n5 now replicates the tail of its log, which replaces the old logs.
deliver 5
deliver 1 2
---
n5@2 → n1 Append base=1@1 [2@2 3@2]
n5@2 → n2 Append base=1@1 [2@2 3@2]
n1@2 append 2@2 None
n1@2 append 3@2 put b=2
n1@2 → n5 AppendResponse match_index=3
n2@2 append 2@2 None
n2@2 append 3@2 put b=2
n2@2 → n5 AppendResponse match_index=3

log 1 2
---
n1@2 term=2 last=3@2 commit=1@1 vote=None
n1@2 entry 1@1 None
n1@2 entry 2@2 None
n1@2 entry 3@2 put b=2
n2@2 term=2 last=3@2 commit=1@1 vote=None
n2@2 entry 1@1 None
n2@2 entry 2@2 None
n2@2 entry 3@2 put b=2

# Stabilize the cluster.
(stabilize heartbeat=true)
status
---
n1@2 follower(n5) last=3@2 commit=3@2 applied=3
n2@2 follower(n5) last=3@2 commit=3@2 applied=3
n3@2 follower(n5) last=3@2 commit=3@2 applied=3
n4@2 follower(n5) last=3@2 commit=3@2 applied=3
n5@2 leader last=3@2 commit=3@2 applied=3 progress={1:3→4 2:3→4 3:3→4 4:3→4}


================================================
FILE: src/raft/testscripts/node/append_response_beyond_last_index_panics
================================================
# A successful AppendResponse with a match index beyond the leader's last log
# index should panic.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Propose a write.
put 1 foo=bar
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]

# An AppendResponse beyond leader's last log should panic.
!step 1 '{"from":2, "to":1, "term":1, "message":{"AppendResponse":{"match_index":3,"reject_index":0}}}'
---
Panic: future match index


================================================
FILE: src/raft/testscripts/node/append_response_stale_reject
================================================
# A rejected AppendResponse with a reject_index at or below the match index
# should be ignored.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicate a write.
(put 1 a=1)
(stabilize heartbeat=true)
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3}
n2@1 follower(n1) last=2@1 commit=2@1 applied=2
n3@1 follower(n1) last=2@1 commit=2@1 applied=2

# Propose a few writes.
(put 1 b=2)
(put 1 c=3)
status
---
n1@1 leader last=4@1 commit=2@1 applied=2 progress={2:2→5 3:2→5}
n2@1 follower(n1) last=2@1 commit=2@1 applied=2
n3@1 follower(n1) last=2@1 commit=2@1 applied=2

# A reject_index at or below the follower's progress match index is ignored.
step 1 '{"from":2,"to":1,"term":1,"message":{"AppendResponse":{"match_index":0,"reject_index":2}}}'
status
---
n1@1 leader last=4@1 commit=2@1 applied=2 progress={2:2→5 3:2→5}
n2@1 follower(n1) last=2@1 commit=2@1 applied=2
n3@1 follower(n1) last=2@1 commit=2@1 applied=2

step 1 '{"from":2,"to":1,"term":1,"message":{"AppendResponse":{"match_index":0,"reject_index":1}}}'
status
---
n1@1 leader last=4@1 commit=2@1 applied=2 progress={2:2→5 3:2→5}
n2@1 follower(n1) last=2@1 commit=2@1 applied=2
n3@1 follower(n1) last=2@1 commit=2@1 applied=2

# The writes are still replicated without any probes.
stabilize
---
n2@1 append 3@1 put b=2
n2@1 → n1 AppendResponse match_index=3
n2@1 append 4@1 put c=3
n2@1 → n1 AppendResponse match_index=4
n3@1 append 3@1 put b=2
n3@1 → n1 AppendResponse match_index=3
n3@1 append 4@1 put c=3
n3@1 → n1 AppendResponse match_index=4
n1@1 commit 3@1
n1@1 apply 3@1 put b=2
n1@1 → c1 ClientResponse id=0x02 write 0x0103
c1@1 put b=2 ⇒ 3
n1@1 commit 4@1
n1@1 apply 4@1 put c=3
n1@1 → c1 ClientResponse id=0x03 write 0x0104
c1@1 put c=3 ⇒ 4


================================================
FILE: src/raft/testscripts/node/election
================================================
# A node campaigns and wins leadership once the election timeout passes. Uses
# ticks directly to also test tick handling.

cluster nodes=3 heartbeat_interval=1 election_timeout=2
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Tick all nodes. Then tick n1 again to make it campaign.
tick
---
ok

tick 1
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0

# n2,n3 grant n1 their votes.
deliver
---
n2@0 follower() ⇨ n2@1 follower()
n2@1 → n1 CampaignResponse vote=true
n3@0 follower() ⇨ n3@1 follower()
n3@1 → n1 CampaignResponse vote=true

# n1 wins the election and becomes leader.
deliver
---
n1@1 candidate ⇨ n1@1 leader
n1@1 append 1@1 None
n1@1 → n2 Append base=0@0 [1@1]
n1@1 → n3 Append base=0@0 [1@1]
n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0

# All nodes become n1 followers.
stabilize
---
n2@1 follower() ⇨ n2@1 follower(n1)
n2@1 append 1@1 None
n2@1 → n1 AppendResponse match_index=1
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n3@1 follower() ⇨ n3@1 follower(n1)
n3@1 append 1@1 None
n3@1 → n1 AppendResponse match_index=1
n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n1@1 commit 1@1
n1@1 apply 1@1 None

# n1's heartbeats are accepted by followers, who commit and apply the entry.
tick 1
---
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0

stabilize
---
n2@1 commit 1@1
n2@1 apply 1@1 None
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n3@1 commit 1@1
n3@1 apply 1@1 None
n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0

status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/election_candidate_behind_leader
================================================
# A candidate that lags behind the leader can still win the election
# as long as it isn't behind the quorum.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n1+n2 away from the cluster.
partition 1 2
---
n1 n2 ⇹ n3 n4 n5

# Replicate a write on n1+n2. The write can't be committed, because n1 doesn't
# have quorum.
(put 1 foo=bar)
(stabilize)
status
---
n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:2→3 3:1→3 4:1→3 5:1→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# List the logs on n1 n2 n3 to show the replicated but uncommitted entry.
log 1 2 3
---
n1@1 term=1 last=2@1 commit=1@1 vote=Some(1)
n1@1 entry 1@1 None
n1@1 entry 2@1 put foo=bar
n2@1 term=1 last=2@1 commit=1@1 vote=Some(1)
n2@1 entry 1@1 None
n2@1 entry 2@1 put foo=bar
n3@1 term=1 last=1@1 commit=1@1 vote=Some(1)
n3@1 entry 1@1 None

# Heal the partition.
heal
---
n1 n2 n3 n4 n5 fully connected

# Make n5 campaign. n3+n4 grant their votes, n1+n2 reject it. n1 aborts the
# in-flight write request because the term changes.
campaign 5
deliver
---
n5@1 follower(n1) ⇨ n5@2 candidate
n5@2 → n1 Campaign last=1@1
n5@2 → n2 Campaign last=1@1
n5@2 → n3 Campaign last=1@1
n5@2 → n4 Campaign last=1@1
n1@1 leader ⇨ n1@2 follower()
n1@1 → c1 ClientResponse id=0x01 Error::Abort
c1@1 put foo=bar ⇒ Error::Abort (operation aborted)
n1@2 → n5 CampaignResponse vote=false
n2@1 follower(n1) ⇨ n2@2 follower()
n2@2 → n5 CampaignResponse vote=false
n3@1 follower(n1) ⇨ n3@2 follower()
n3@2 → n5 CampaignResponse vote=true
n4@1 follower(n1) ⇨ n4@2 follower()
n4@2 → n5 CampaignResponse vote=true

# n5 wins the election and becomes leader.
stabilize heartbeat=true
---
n5@2 candidate ⇨ n5@2 leader
n5@2 append 2@2 None
n5@2 → n1 Append base=1@1 [2@2]
n5@2 → n2 Append base=1@1 [2@2]
n5@2 → n3 Append base=1@1 [2@2]
n5@2 → n4 Append base=1@1 [2@2]
n5@2 → n1 Heartbeat last_index=2 commit_index=1 read_seq=0
n5@2 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0
n5@2 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0
n5@2 → n4 Heartbeat last_index=2 commit_index=1 read_seq=0
n1@2 follower() ⇨ n1@2 follower(n5)
n1@2 append 2@2 None
n1@2 → n5 AppendResponse match_index=2
n1@2 → n5 HeartbeatResponse match_index=2 read_seq=0
n2@2 follower() ⇨ n2@2 follower(n5)
n2@2 append 2@2 None
n2@2 → n5 AppendResponse match_index=2
n2@2 → n5 HeartbeatResponse match_index=2 read_seq=0
n3@2 follower() ⇨ n3@2 follower(n5)
n3@2 append 2@2 None
n3@2 → n5 AppendResponse match_index=2
n3@2 → n5 HeartbeatResponse match_index=2 read_seq=0
n4@2 follower() ⇨ n4@2 follower(n5)
n4@2 append 2@2 None
n4@2 → n5 AppendResponse match_index=2
n4@2 → n5 HeartbeatResponse match_index=2 read_seq=0
n5@2 commit 2@2
n5@2 apply 2@2 None
n5@2 → n1 Heartbeat last_index=2 commit_index=2 read_seq=0
n5@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0
n5@2 → n3 Heartbeat last_index=2 commit_index=2 read_seq=0
n5@2 → n4 Heartbeat last_index=2 commit_index=2 read_seq=0
n1@2 commit 2@2
n1@2 apply 2@2 None
n1@2 → n5 HeartbeatResponse match_index=2 read_seq=0
n2@2 commit 2@2
n2@2 apply 2@2 None
n2@2 → n5 HeartbeatResponse match_index=2 read_seq=0
n3@2 commit 2@2
n3@2 apply 2@2 None
n3@2 → n5 HeartbeatResponse match_index=2 read_seq=0
n4@2 commit 2@2
n4@2 apply 2@2 None
n4@2 → n5 HeartbeatResponse match_index=2 read_seq=0

# n1+n2's in-flight write at log position 2 has been replaced by the
# empty log entry appended by n5 when it became leader.
log 1 2
---
n1@2 term=2 last=2@2 commit=2@2 vote=None
n1@2 entry 1@1 None
n1@2 entry 2@2 None
n2@2 term=2 last=2@2 commit=2@2 vote=None
n2@2 entry 1@1 None
n2@2 entry 2@2 None

status
---
n1@2 follower(n5) last=2@2 commit=2@2 applied=2
n2@2 follower(n5) last=2@2 commit=2@2 applied=2
n3@2 follower(n5) last=2@2 commit=2@2 applied=2
n4@2 follower(n5) last=2@2 commit=2@2 applied=2
n5@2 leader last=2@2 commit=2@2 applied=2 progress={1:2→3 2:2→3 3:2→3 4:2→3}


================================================
FILE: src/raft/testscripts/node/election_candidate_behind_quorum
================================================
# A candidate that lags behind the quorum can't win an election.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n4+n5 away from the cluster.
partition 4 5
---
n4 n5 ⇹ n1 n2 n3

# Replicate a write on n1. n4+n5 now lag behind the quorum. Don't yet propagate
# the commit index to n2+n3, to make sure they won't grant their votes just because
# n5 is caught up to their local view of the commit index.
(put 1 foo=bar)
(stabilize)
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3 4:1→3 5:1→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=2@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Heal the partition.
heal
---
n1 n2 n3 n4 n5 fully connected

# Make n5 campaign. n4 grants its vote, but the others reject it because it is
# behind the quorum. However, the term bump will convert the other nodes to
# leaderless followers.
heal
campaign 5
stabilize
---
n1 n2 n3 n4 n5 fully connected
n5@1 follower(n1) ⇨ n5@2 candidate
n5@2 → n1 Campaign last=1@1
n5@2 → n2 Campaign last=1@1
n5@2 → n3 Campaign last=1@1
n5@2 → n4 Campaign last=1@1
n1@1 leader ⇨ n1@2 follower()
n1@2 → n5 CampaignResponse vote=false
n2@1 follower(n1) ⇨ n2@2 follower()
n2@2 → n5 CampaignResponse vote=false
n3@1 follower(n1) ⇨ n3@2 follower()
n3@2 → n5 CampaignResponse vote=false
n4@1 follower(n1) ⇨ n4@2 follower()
n4@2 → n5 CampaignResponse vote=true

status
---
n1@2 follower() last=2@1 commit=2@1 applied=2
n2@2 follower() last=2@1 commit=1@1 applied=1
n3@2 follower() last=2@1 commit=1@1 applied=1
n4@2 follower() last=1@1 commit=1@1 applied=1
n5@2 candidate last=1@1 commit=1@1 applied=1

# n2 can campaign and win the election.
(campaign 2)
(stabilize heartbeat=true)
status
---
n1@3 follower(n2) last=3@3 commit=3@3 applied=3
n2@3 leader last=3@3 commit=3@3 applied=3 progress={1:3→4 3:3→4 4:3→4 5:3→4}
n3@3 follower(n2) last=3@3 commit=3@3 applied=3
n4@3 follower(n2) last=3@3 commit=3@3 applied=3
n5@3 follower(n2) last=3@3 commit=3@3 applied=3


================================================
FILE: src/raft/testscripts/node/election_contested
================================================
# A leader can be elected even when there are multiple candidates.

cluster nodes=5 election_timeout=2
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0
n4@0 follower() last=0@0 commit=0@0 applied=0
n5@0 follower() last=0@0 commit=0@0 applied=0

# n1 and n5 campaign.
tick
tick 1 5
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0
n1@1 → n4 Campaign last=0@0
n1@1 → n5 Campaign last=0@0
n5@0 follower() ⇨ n5@1 candidate
n5@1 → n1 Campaign last=0@0
n5@1 → n2 Campaign last=0@0
n5@1 → n3 Campaign last=0@0
n5@1 → n4 Campaign last=0@0

# n1 and n5 reject each other's campaigns, since they're both campaigning.
deliver 1 5
---
n1@1 → n5 CampaignResponse vote=false
n5@1 → n1 CampaignResponse vote=false

# n1 reaches n2,n3 first, but n5 reaches n4 first.
deliver 2 3
deliver 4 from=5
deliver 4
---
n2@0 follower() ⇨ n2@1 follower()
n2@1 → n1 CampaignResponse vote=true
n2@1 → n5 CampaignResponse vote=false
n3@0 follower() ⇨ n3@1 follower()
n3@1 → n1 CampaignResponse vote=true
n3@1 → n5 CampaignResponse vote=false
n4@0 follower() ⇨ n4@1 follower()
n4@1 → n5 CampaignResponse vote=true
n4@1 → n1 CampaignResponse vote=false

# n1 and n5 receive their votes. n1 has quorum and becomes leader.
deliver
---
n1@1 candidate ⇨ n1@1 leader
n1@1 append 1@1 None
n1@1 → n2 Append base=0@0 [1@1]
n1@1 → n3 Append base=0@0 [1@1]
n1@1 → n4 Append base=0@0 [1@1]
n1@1 → n5 Append base=0@0 [1@1]
n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n4 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n5 Heartbeat last_index=1 commit_index=0 read_seq=0

# All nodes accept n1 as leader in term 1 and become followers.
stabilize
---
n2@1 follower() ⇨ n2@1 follower(n1)
n2@1 append 1@1 None
n2@1 → n1 AppendResponse match_index=1
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n3@1 follower() ⇨ n3@1 follower(n1)
n3@1 append 1@1 None
n3@1 → n1 AppendResponse match_index=1
n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n4@1 follower() ⇨ n4@1 follower(n1)
n4@1 append 1@1 None
n4@1 → n1 AppendResponse match_index=1
n4@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n5@1 candidate ⇨ n5@1 follower(n1)
n5@1 append 1@1 None
n5@1 → n1 AppendResponse match_index=1
n5@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n1@1 commit 1@1
n1@1 apply 1@1 None

status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=0@0 applied=0
n3@1 follower(n1) last=1@1 commit=0@0 applied=0
n4@1 follower(n1) last=1@1 commit=0@0 applied=0
n5@1 follower(n1) last=1@1 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/election_tie
================================================
# No leader can be elected with an election tie.

cluster nodes=3 election_timeout=2
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Tick all nodes twice to make them all campaign.
tick
tick
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0
n2@0 follower() ⇨ n2@1 candidate
n2@1 → n1 Campaign last=0@0
n2@1 → n3 Campaign last=0@0
n3@0 follower() ⇨ n3@1 candidate
n3@1 → n1 Campaign last=0@0
n3@1 → n2 Campaign last=0@0

# Stabilizing the cluster will not result in a leader.
stabilize
---
n1@1 → n2 CampaignResponse vote=false
n1@1 → n3 CampaignResponse vote=false
n2@1 → n1 CampaignResponse vote=false
n2@1 → n3 CampaignResponse vote=false
n3@1 → n1 CampaignResponse vote=false
n3@1 → n2 CampaignResponse vote=false

status
---
n1@1 candidate last=0@0 commit=0@0 applied=0
n2@1 candidate last=0@0 commit=0@0 applied=0
n3@1 candidate last=0@0 commit=0@0 applied=0

# A node can call another election in a new term and win.
tick 2
tick 2
---
n2@1 candidate ⇨ n2@2 candidate
n2@2 → n1 Campaign last=0@0
n2@2 → n3 Campaign last=0@0

deliver
---
n1@1 candidate ⇨ n1@2 follower()
n1@2 → n2 CampaignResponse vote=true
n3@1 candidate ⇨ n3@2 follower()
n3@2 → n2 CampaignResponse vote=true

deliver
---
n2@2 candidate ⇨ n2@2 leader
n2@2 append 1@2 None
n2@2 → n1 Append base=0@0 [1@2]
n2@2 → n3 Append base=0@0 [1@2]
n2@2 → n1 Heartbeat last_index=1 commit_index=0 read_seq=0
n2@2 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0

stabilize
---
n1@2 follower() ⇨ n1@2 follower(n2)
n1@2 append 1@2 None
n1@2 → n2 AppendResponse match_index=1
n1@2 → n2 HeartbeatResponse match_index=1 read_seq=0
n3@2 follower() ⇨ n3@2 follower(n2)
n3@2 append 1@2 None
n3@2 → n2 AppendResponse match_index=1
n3@2 → n2 HeartbeatResponse match_index=1 read_seq=0
n2@2 commit 1@2
n2@2 apply 1@2 None

status
---
n1@2 follower(n2) last=1@2 commit=0@0 applied=0
n2@2 leader last=1@2 commit=1@2 applied=1 progress={1:1→2 3:1→2}
n3@2 follower(n2) last=1@2 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/election_tie_even
================================================
# No leader can be elected with an election tie between an even number of nodes.

cluster nodes=4 election_timeout=2
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0
n4@0 follower() last=0@0 commit=0@0 applied=0

# n1 and n4 campaign.
tick
tick 1 4
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0
n1@1 → n4 Campaign last=0@0
n4@0 follower() ⇨ n4@1 candidate
n4@1 → n1 Campaign last=0@0
n4@1 → n2 Campaign last=0@0
n4@1 → n3 Campaign last=0@0

# n2 votes for n1, n3 votes for n4.
deliver 2
deliver 3 from=4
deliver 3
---
n2@0 follower() ⇨ n2@1 follower()
n2@1 → n1 CampaignResponse vote=true
n2@1 → n4 CampaignResponse vote=false
n3@0 follower() ⇨ n3@1 follower()
n3@1 → n4 CampaignResponse vote=true
n3@1 → n1 CampaignResponse vote=false

# Stabilizing the cluster will not result in a leader.
stabilize
---
n1@1 → n4 CampaignResponse vote=false
n4@1 → n1 CampaignResponse vote=false

status
---
n1@1 candidate last=0@0 commit=0@0 applied=0
n2@1 follower() last=0@0 commit=0@0 applied=0
n3@1 follower() last=0@0 commit=0@0 applied=0
n4@1 candidate last=0@0 commit=0@0 applied=0

# A node can call another election in a new term and win.
tick 3
tick 3
---
n3@1 follower() ⇨ n3@2 candidate
n3@2 → n1 Campaign last=0@0
n3@2 → n2 Campaign last=0@0
n3@2 → n4 Campaign last=0@0

deliver
---
n1@1 candidate ⇨ n1@2 follower()
n1@2 → n3 CampaignResponse vote=true
n2@1 follower() ⇨ n2@2 follower()
n2@2 → n3 CampaignResponse vote=true
n4@1 candidate ⇨ n4@2 follower()
n4@2 → n3 CampaignResponse vote=true

deliver
---
n3@2 candidate ⇨ n3@2 leader
n3@2 append 1@2 None
n3@2 → n1 Append base=0@0 [1@2]
n3@2 → n2 Append base=0@0 [1@2]
n3@2 → n4 Append base=0@0 [1@2]
n3@2 → n1 Heartbeat last_index=1 commit_index=0 read_seq=0
n3@2 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0
n3@2 → n4 Heartbeat last_index=1 commit_index=0 read_seq=0

stabilize
---
n1@2 follower() ⇨ n1@2 follower(n3)
n1@2 append 1@2 None
n1@2 → n3 AppendResponse match_index=1
n1@2 → n3 HeartbeatResponse match_index=1 read_seq=0
n2@2 follower() ⇨ n2@2 follower(n3)
n2@2 append 1@2 None
n2@2 → n3 AppendResponse match_index=1
n2@2 → n3 HeartbeatResponse match_index=1 read_seq=0
n4@2 follower() ⇨ n4@2 follower(n3)
n4@2 append 1@2 None
n4@2 → n3 AppendResponse match_index=1
n4@2 → n3 HeartbeatResponse match_index=1 read_seq=0
n3@2 commit 1@2
n3@2 apply 1@2 None

status
---
n1@2 follower(n3) last=1@2 commit=0@0 applied=0
n2@2 follower(n3) last=1@2 commit=0@0 applied=0
n3@2 leader last=1@2 commit=1@2 applied=1 progress={1:1→2 2:1→2 4:1→2}
n4@2 follower(n3) last=1@2 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/heartbeat_commits_follower
================================================
# A heartbeat will commit and apply an entry on a follower.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Write on the leader, which replicates then commits and applies locally.
put 1 foo=bar
stabilize
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]
n2@1 append 2@1 put foo=bar
n2@1 → n1 AppendResponse match_index=2
n3@1 append 2@1 put foo=bar
n3@1 → n1 AppendResponse match_index=2
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put foo=bar ⇒ 2

# The write has been replicated, but not yet committed and applied on followers.
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=2@1 commit=1@1 applied=1

# A heartbeat commits and applies on followers.
heartbeat 1
stabilize
---
n1@1 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0
n1@1 → n3 Heartbeat last_index=2 commit_index=2 read_seq=0
n2@1 commit 2@1
n2@1 apply 2@1 put foo=bar
n2@1 → n1 HeartbeatResponse match_index=2 read_seq=0
n3@1 commit 2@1
n3@1 apply 2@1 put foo=bar
n3@1 → n1 HeartbeatResponse match_index=2 read_seq=0

status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3}
n2@1 follower(n1) last=2@1 commit=2@1 applied=2
n3@1 follower(n1) last=2@1 commit=2@1 applied=2


================================================
FILE: src/raft/testscripts/node/heartbeat_converts_candidate
================================================
# A heartbeat from a leader should convert a candidate in the same term to a
# follower.

cluster nodes=3
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Partition n3 away from the cluster.
partition 3
---
n3 ⇹ n1 n2

# Both n1 and n3 campaign. n2 votes for n1.
campaign 1 3
deliver
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 ⇥ n3 C̶a̶m̶p̶a̶i̶g̶n̶ ̶l̶a̶s̶t̶=̶0̶@̶0̶
n3@0 follower() ⇨ n3@1 candidate
n3@1 ⇥ n1 C̶a̶m̶p̶a̶i̶g̶n̶ ̶l̶a̶s̶t̶=̶0̶@̶0̶
n3@1 ⇥ n2 C̶a̶m̶p̶a̶i̶g̶n̶ ̶l̶a̶s̶t̶=̶0̶@̶0̶
n2@0 follower() ⇨ n2@1 follower()
n2@1 → n1 CampaignResponse vote=true

# n1 assumes leadership and heartbeats, committing entry 1.
(stabilize heartbeat=true)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:0→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 candidate last=0@0 commit=0@0 applied=0

# Heal the partition.
heal
---
n1 n2 n3 fully connected

# The next heartbeat from n1 converts n3 to a follower in term 1.
heartbeat 1
stabilize
---
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n3@1 candidate ⇨ n3@1 follower(n1)
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n1@1 → n3 Append base=0@0 [1@1]
n3@1 append 1@1 None
n3@1 → n1 AppendResponse match_index=1

status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/heartbeat_converts_follower
================================================
# A heartbeat from a leader should convert a follower of a different leader in a
# past term to a follower.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n2 away from the cluster.
partition 2
---
n2 ⇹ n1 n3

# Elect n3 as a new leader.
(campaign 3)
(stabilize heartbeat=true)
status
---
n1@2 follower(n3) last=2@2 commit=2@2 applied=2
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@2 leader last=2@2 commit=2@2 applied=2 progress={1:2→3 2:0→3}

# Heal the partition.
heal
---
n1 n2 n3 fully connected

# The next heartbeat from n3 converts n2 to a follower in term 2.
heartbeat 3
stabilize heartbeat=true
---
n3@2 → n1 Heartbeat last_index=2 commit_index=2 read_seq=0
n3@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0
n1@2 → n3 HeartbeatResponse match_index=2 read_seq=0
n2@1 follower(n1) ⇨ n2@2 follower(n3)
n2@2 → n3 HeartbeatResponse match_index=0 read_seq=0
n3@2 → n2 Append base=1@1 []
n2@2 → n3 AppendResponse match_index=1
n3@2 → n2 Append base=1@1 [2@2]
n2@2 append 2@2 None
n2@2 → n3 AppendResponse match_index=2
n3@2 → n1 Heartbeat last_index=2 commit_index=2 read_seq=0
n3@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0
n1@2 → n3 HeartbeatResponse match_index=2 read_seq=0
n2@2 commit 2@2
n2@2 apply 2@2 None
n2@2 → n3 HeartbeatResponse match_index=2 read_seq=0

status
---
n1@2 follower(n3) last=2@2 commit=2@2 applied=2
n2@2 follower(n3) last=2@2 commit=2@2 applied=2
n3@2 leader last=2@2 commit=2@2 applied=2 progress={1:2→3 2:2→3}


================================================
FILE: src/raft/testscripts/node/heartbeat_converts_follower_leaderless
================================================
# A heartbeat from a leader should convert a leaderless follower.

cluster nodes=3
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Partition n3 away from the cluster.
partition 3
---
n3 ⇹ n1 n2

# Elect n1 as a new leader.
(campaign 1)
(stabilize heartbeat=true)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:0→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@0 follower() last=0@0 commit=0@0 applied=0

# Heal the partition.
heal
---
n1 n2 n3 fully connected

# The next heartbeat from n1 converts n3 to a follower in term 1.
heartbeat 1
stabilize
---
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n3@0 follower() ⇨ n3@1 follower(n1)
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n1@1 → n3 Append base=0@0 [1@1]
n3@1 append 1@1 None
n3@1 → n1 AppendResponse match_index=1

status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/heartbeat_converts_leader
================================================
# A heartbeat from a leader should convert a leader in a past term to a
# follower.

cluster nodes=3 leader=3
---
n1@1 follower(n3) last=1@1 commit=1@1 applied=1
n2@1 follower(n3) last=1@1 commit=1@1 applied=1
n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2}

# Partition n3 away from the cluster.
partition 3
---
n3 ⇹ n1 n2

# Elect n1 as a new leader.
(campaign 1)
(stabilize heartbeat=true)
status
---
n1@2 leader last=2@2 commit=2@2 applied=2 progress={2:2→3 3:0→3}
n2@2 follower(n1) last=2@2 commit=2@2 applied=2
n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2}

# Heal the partition.
heal
---
n1 n2 n3 fully connected

# The next heartbeat from n1 converts n3 to a follower in term 2.
heartbeat 1
stabilize heartbeat=true
---
n1@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0
n1@2 → n3 Heartbeat last_index=2 commit_index=2 read_seq=0
n2@2 → n1 HeartbeatResponse match_index=2 read_seq=0
n3@1 leader ⇨ n3@2 follower(n1)
n3@2 → n1 HeartbeatResponse match_index=0 read_seq=0
n1@2 → n3 Append base=1@1 []
n3@2 → n1 AppendResponse match_index=1
n1@2 → n3 Append base=1@1 [2@2]
n3@2 append 2@2 None
n3@2 → n1 AppendResponse match_index=2
n1@2 → n2 Heartbeat last_index=2 commit_index=2 read_seq=0
n1@2 → n3 Heartbeat last_index=2 commit_index=2 read_seq=0
n2@2 → n1 HeartbeatResponse match_index=2 read_seq=0
n3@2 commit 2@2
n3@2 apply 2@2 None
n3@2 → n1 HeartbeatResponse match_index=2 read_seq=0

status
---
n1@2 leader last=2@2 commit=2@2 applied=2 progress={2:2→3 3:2→3}
n2@2 follower(n1) last=2@2 commit=2@2 applied=2
n3@2 follower(n1) last=2@2 commit=2@2 applied=2


================================================
FILE: src/raft/testscripts/node/heartbeat_lost_append_duplicate
================================================
# Duplicate heartbeats and responses with a lost append will
# trigger duplicate resends, but it will eventually resolve.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition the leader, submit a write whose appends are dropped,
# then heal the partition again.
partition 1
---
n1 ⇹ n2 n3

put 1 foo=bar
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶

heal
---
n1 n2 n3 fully connected

# The next heartbeat will result in match_index=0 since the followers
# don't have the last_index. 3 heartbeats are made.
heartbeat 1
heartbeat 1
heartbeat 1
deliver
---
n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0
n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0
n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0

# The leader has previously matched the followers at index 1.
status 1
---
n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3}

# When it receives the heartbeat responses, it sends duplicates of the missing
# entries.
deliver
---
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]

# The followers accept the duplicate appends and the leader commits and applies.
stabilize
---
n2@1 append 2@1 put foo=bar
n2@1 → n1 AppendResponse match_index=2
n2@1 → n1 AppendResponse match_index=2
n2@1 → n1 AppendResponse match_index=2
n3@1 append 2@1 put foo=bar
n3@1 → n1 AppendResponse match_index=2
n3@1 → n1 AppendResponse match_index=2
n3@1 → n1 AppendResponse match_index=2
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put foo=bar ⇒ 2


================================================
FILE: src/raft/testscripts/node/heartbeat_lost_append_multiple
================================================
# A heartbeat response triggers a probe and resend of lost appends.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition the leader, submit three writes whose appends are dropped, then heal
# the partition again.
partition 1
---
n1 ⇹ n2 n3

put 1 a=1
put 1 b=2
put 1 c=3
---
c1@1 → n1 ClientRequest id=0x01 write 0x0101610131
n1@1 append 2@1 put a=1
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
c1@1 → n1 ClientRequest id=0x02 write 0x0101620132
n1@1 append 3@1 put b=2
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶2̶@̶1̶ ̶[̶3̶@̶1̶]̶
c1@1 → n1 ClientRequest id=0x03 write 0x0101630133
n1@1 append 4@1 put c=3
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶1̶]̶

heal
status
---
n1 n2 n3 fully connected
n1@1 leader last=4@1 commit=1@1 applied=1 progress={2:1→5 3:1→5}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# The next heartbeat will result in match_index=0 since the followers
# don't have the last_index.
heartbeat 1
deliver
---
n1@1 → n2 Heartbeat last_index=4 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=4 commit_index=1 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0

# The leader has previously matched the followers at index 1.
status 1
---
n1@1 leader last=4@1 commit=1@1 applied=1 progress={2:1→5 3:1→5}

# When it receives the heartbeat response, it probes the previous index 3.
deliver
---
n1@1 → n2 Append base=3@1 []
n1@1 → n3 Append base=3@1 []

# The followers don't have index 3. They don't have index 2 either, but they
# do have 1, so they respond with a reject_index=2.
deliver
---
n2@1 → n1 AppendResponse reject_index=2
n3@1 → n1 AppendResponse reject_index=2

# The leader has already matched index 1, so it doesn't have to probe for it,
# and can simply send the tail of the log.
deliver
---
n1@1 → n2 Append base=1@1 [2@1 3@1 4@1]
n1@1 → n3 Append base=1@1 [2@1 3@1 4@1]

# The followers accept the append and the leader commits and applies.
stabilize
---
n2@1 append 2@1 put a=1
n2@1 append 3@1 put b=2
n2@1 append 4@1 put c=3
n2@1 → n1 AppendResponse match_index=4
n3@1 append 2@1 put a=1
n3@1 append 3@1 put b=2
n3@1 append 4@1 put c=3
n3@1 → n1 AppendResponse match_index=4
n1@1 commit 4@1
n1@1 apply 2@1 put a=1
n1@1 apply 3@1 put b=2
n1@1 apply 4@1 put c=3
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put a=1 ⇒ 2
n1@1 → c1 ClientResponse id=0x02 write 0x0103
c1@1 put b=2 ⇒ 3
n1@1 → c1 ClientResponse id=0x03 write 0x0104
c1@1 put c=3 ⇒ 4


================================================
FILE: src/raft/testscripts/node/heartbeat_lost_append_single
================================================
# A heartbeat response triggers a resend of a lost append.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition the leader, submit a write whose appends are dropped,
# then heal the partition again.
partition 1
---
n1 ⇹ n2 n3

put 1 foo=bar
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶

heal
---
n1 n2 n3 fully connected

# The next heartbeat will result in match_index=0 since the followers
# don't have the last_index.
heartbeat 1
deliver
---
n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=0 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=0

# The leader has previously matched the followers at index 1.
status 1
---
n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3}

# When it receives the heartbeat response, instead of probing index 1 and then
# sending the actual entries, it simply sends the entries.
deliver
---
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]

# The followers accept the append and the leader commits and applies.
stabilize
---
n2@1 append 2@1 put foo=bar
n2@1 → n1 AppendResponse match_index=2
n3@1 append 2@1 put foo=bar
n3@1 → n1 AppendResponse match_index=2
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put foo=bar ⇒ 2


================================================
FILE: src/raft/testscripts/node/heartbeat_lost_read
================================================
# Heartbeats will recover from a lost read message.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Write a key and replicate it.
(put 1 foo=bar)
(stabilize heartbeat=true)
---
ok

# Partition the leader, and submit a read.
partition 1
---
n1 ⇹ n2 n3 n4 n5

get 1 foo
---
c1@1 → n1 ClientRequest id=0x02 read 0x0003666f6f
n1@1 ⇥ n2 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶
n1@1 ⇥ n3 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶
n1@1 ⇥ n4 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶
n1@1 ⇥ n5 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶

heal
---
n1 n2 n3 n4 n5 fully connected

# The next heartbeat will detect the failed read, and serve it when
# it has a quorum.
heartbeat 1
deliver
---
n1@1 → n2 Heartbeat last_index=2 commit_index=2 read_seq=1
n1@1 → n3 Heartbeat last_index=2 commit_index=2 read_seq=1
n1@1 → n4 Heartbeat last_index=2 commit_index=2 read_seq=1
n1@1 → n5 Heartbeat last_index=2 commit_index=2 read_seq=1
n2@1 → n1 HeartbeatResponse match_index=2 read_seq=1
n3@1 → n1 HeartbeatResponse match_index=2 read_seq=1
n4@1 → n1 HeartbeatResponse match_index=2 read_seq=1
n5@1 → n1 HeartbeatResponse match_index=2 read_seq=1

# The first response does not provide quorum.
deliver 1 from=2
---
ok

# The second does, and the read is served.
deliver 1 from=3
---
n1@1 → c1 ClientResponse id=0x02 read 0x000103626172
c1@1 get foo ⇒ bar


================================================
FILE: src/raft/testscripts/node/heartbeat_match_commits
================================================
# A heartbeat response can advance a follower match index and commit+apply.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Submit a write to the leader.
put 1 foo=bar
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]

# Partition n1 away from the followers as they send the append acks, then heal
# the partition.
partition 1
---
n1 ⇹ n2 n3

stabilize
---
n2@1 append 2@1 put foo=bar
n2@1 ⇥ n1 A̶p̶p̶e̶n̶d̶R̶e̶s̶p̶o̶n̶s̶e̶ ̶m̶a̶t̶c̶h̶_̶i̶n̶d̶e̶x̶=̶2̶
n3@1 append 2@1 put foo=bar
n3@1 ⇥ n1 A̶p̶p̶e̶n̶d̶R̶e̶s̶p̶o̶n̶s̶e̶ ̶m̶a̶t̶c̶h̶_̶i̶n̶d̶e̶x̶=̶2̶

heal
---
n1 n2 n3 fully connected

# The write has been replicated, but not yet committed and applied.
status
---
n1@1 leader last=2@1 commit=1@1 applied=1 progress={2:1→3 3:1→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=2@1 commit=1@1 applied=1

# The leader heartbeats. The followers confirm they are caught up.
heartbeat 1
deliver
---
n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=2 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=2 read_seq=0

# When the leader receives the first heartbeat response, it commits and applies
# the write.
deliver 1 from=2
---
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put foo=bar ⇒ 2

status 1
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:1→3}

# Delivery of the second heartbeat response advances the match index, but
# there is nothing more to do.
deliver
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=2@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/heartbeat_multiple_leaders_panic
================================================
# A heartbeat will panic if there are multiple leaders in a term.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Leader panics if it sees another leader in the same term.
!step 1 '{"from":2, "to":1, "term":1, "message": {"Heartbeat":{"last_index":1,"commit_index":0, "commit_term":0, "read_seq":0}}}'
---
Panic: saw other leader 2 in term 1

# Follower panics too.
!step 2 '{"from":3, "to":2, "term":1, "message": {"Heartbeat":{"last_index":1,"commit_index":0, "commit_term":0, "read_seq":0}}}'
---
Panic: assertion `left == right` failed: multiple leaders in term
  left: 3
 right: 1


================================================
FILE: src/raft/testscripts/node/heartbeat_old_commit_index
================================================
# A heartbeat with an old commit index is ignored by a follower.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicate a write.
(put 1 foo=bar)
(stabilize heartbeat=true)
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3}
n2@1 follower(n1) last=2@1 commit=2@1 applied=2
n3@1 follower(n1) last=2@1 commit=2@1 applied=2

# Step a heartbeat with an outdated commit index.
step 2 '{"from":1, "to":2, "term":1, "message":{"Heartbeat":{"last_index":2,"commit_index":1,"commit_term":1,"read_seq":0}}}'
stabilize
---
n2@1 → n1 HeartbeatResponse match_index=2 read_seq=0


================================================
FILE: src/raft/testscripts/node/heartbeat_old_last_index
================================================
# A heartbeat with an old last index is matched by a follower.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicate a write.
(put 1 foo=bar)
(stabilize heartbeat=true)
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3}
n2@1 follower(n1) last=2@1 commit=2@1 applied=2
n3@1 follower(n1) last=2@1 commit=2@1 applied=2

# Step a heartbeat with an outdated last index.
step 2 '{"from":1, "to":2, "term":1, "message":{"Heartbeat":{"last_index":1,"commit_index":1,"commit_term":1,"read_seq":0}}}'
stabilize
---
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0


================================================
FILE: src/raft/testscripts/node/heartbeat_probe_divergent
================================================
# A heartbeat while the leader is probing a follower with a long divergent tail
# doesn't disrupt the probing, and won't result in a quadratically increasing
# amount of probes with each heartbeat.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Make a couple of writes to ensure a common log prefix.
(put 1 a=1)
(put 1 b=2)
(stabilize)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@1 follower(n1) last=3@1 commit=1@1 applied=1

# Partition n1 away from the cluster.
partition 1
---
n1 ⇹ n2 n3

# Elect new leaders in the majority partition and replicate a few writes.
(campaign 2)
(stabilize)
(put 2 c=3)
(put 2 d=4)
(stabilize heartbeat=true)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4}
n2@2 leader last=6@2 commit=6@2 applied=6 progress={1:0→7 3:6→7}
n3@2 follower(n2) last=6@2 commit=6@2 applied=6

(campaign 3)
(stabilize)
(put 2 e=5)
(put 2 f=6)
(stabilize heartbeat=true)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4}
n2@3 follower(n3) last=9@3 commit=9@3 applied=9
n3@3 leader last=9@3 commit=9@3 applied=9 progress={1:0→10 2:9→10}

# Propose writes in the minority partition as well, to build up a divergent log.
(put 1 a=2)
(put 1 a=3)
(put 1 a=4)
(put 1 a=5)
(put 1 a=6)
(put 1 a=7)
(put 1 a=8)
(put 1 a=9)
(stabilize)
status
---
n1@1 leader last=11@1 commit=3@1 applied=3 progress={2:3→12 3:3→12}
n2@3 follower(n3) last=9@3 commit=9@3 applied=9
n3@3 leader last=9@3 commit=9@3 applied=9 progress={1:0→10 2:9→10}

# Heal the partition.
heal
---
n1 n2 n3 fully connected

# Propose another write on the majority leader to start probing.
put 3 g=7
---
c3@3 → n3 ClientRequest id=0x0f write 0x0101670137
n3@3 append 10@3 put g=7
n3@3 → n1 Append base=9@3 [10@3]
n3@3 → n2 Append base=9@3 [10@3]

# The append should be rejected by n1, canceling the writes.
deliver 1
---
n1@1 leader ⇨ n1@3 follower(n3)
n1@1 → c1 ClientResponse id=0x07 Error::Abort
c1@1 put a=2 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x08 Error::Abort
c1@1 put a=3 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x09 Error::Abort
c1@1 put a=4 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0a Error::Abort
c1@1 put a=5 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0b Error::Abort
c1@1 put a=6 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0c Error::Abort
c1@1 put a=7 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0d Error::Abort
c1@1 put a=8 ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x0e Error::Abort
c1@1 put a=9 ⇒ Error::Abort (operation aborted)
n1@3 → n3 AppendResponse reject_index=9

# n3 begins probing, and also heartbeats.
deliver 3
heartbeat 3
deliver 1
status 3
---
n3@3 → n1 Append base=8@3 []
n3@3 → n1 Heartbeat last_index=10 commit_index=9 read_seq=0
n3@3 → n2 Heartbeat last_index=10 commit_index=9 read_seq=0
n1@3 → n3 AppendResponse reject_index=8
n1@3 → n3 HeartbeatResponse match_index=0 read_seq=0
n3@3 leader last=10@3 commit=9@3 applied=9 progress={1:0→9 2:9→11}

# n3 receives probe and heartbeat responses, resulting in duplicate
# probes being sent at base index 7.
deliver 3
status 3
---
n3@3 → n1 Append base=7@3 []
n3@3 → n1 Append base=7@3 []
n3@3 leader last=10@3 commit=9@3 applied=9 progress={1:0→8 2:9→11}

deliver 1
---
n1@3 → n3 AppendResponse reject_index=7
n1@3 → n3 AppendResponse reject_index=7

# However, when receiving the duplicate probe responses, they are
# deduplicated and only a single new probe is sent.
deliver 3
---
n3@3 → n1 Append base=6@2 []

deliver 1
---
n1@3 → n3 AppendResponse reject_index=6

# n3 heartbeats again before sending the next probe. This results in
# two probes: the heartbeat response resends the probe at base 5, while
# the probe response triggers a new probe at base 4.
heartbeat 3
deliver 3
---
n3@3 → n1 Heartbeat last_index=10 commit_index=9 read_seq=0
n3@3 → n2 Heartbeat last_index=10 commit_index=9 read_seq=0
n3@3 → n1 Append base=5@2 []

deliver 1
---
n1@3 → n3 HeartbeatResponse match_index=0 read_seq=0
n1@3 → n3 AppendResponse reject_index=5

deliver 3
---
n3@3 → n1 Append base=5@2 []
n3@3 → n1 Append base=4@2 []

deliver 1
---
n1@3 → n3 AppendResponse reject_index=5
n1@3 → n3 AppendResponse reject_index=4

# The probe response at reject_index=5 is ignored, since we've already probed
# it. Only a single new probe is sent, at base 3.
deliver 3
---
n3@3 → n1 Append base=3@1 []

# When delivered, we finally get a match, and the follower gets caught up.
deliver 1
---
n1@3 → n3 AppendResponse match_index=3

deliver 3
---
n3@3 → n1 Append base=3@1 [4@2 5@2 6@2 7@3 8@3 9@3 10@3]

deliver 1
---
n1@3 append 4@2 None
n1@3 append 5@2 put c=3
n1@3 append 6@2 put d=4
n1@3 append 7@3 None
n1@3 append 8@3 put e=5
n1@3 append 9@3 put f=6
n1@3 append 10@3 put g=7
n1@3 → n3 AppendResponse match_index=10

deliver 3
---
n3@3 commit 10@3
n3@3 apply 10@3 put g=7
n3@3 → c3 ClientResponse id=0x0f write 0x010a
c3@3 put g=7 ⇒ 10


================================================
FILE: src/raft/testscripts/node/old_campaign_rejected
================================================
# Old campaign messages (in the same term) are rejected by leaders and followers
# once a leader is elected.

cluster nodes=3
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# n1 and n2 campaign.
campaign 1 2
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0
n2@0 follower() ⇨ n2@1 candidate
n2@1 → n1 Campaign last=0@0
n2@1 → n3 Campaign last=0@0

# n3 receives n1's Campaign message and grants its vote.
deliver 3 from=1
---
n3@0 follower() ⇨ n3@1 follower()
n3@1 → n1 CampaignResponse vote=true

# n1 becomes leader.
deliver 1 from=3
---
n1@1 candidate ⇨ n1@1 leader
n1@1 append 1@1 None
n1@1 → n2 Append base=0@0 [1@1]
n1@1 → n3 Append base=0@0 [1@1]
n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0

# n3 receives n1's append and heartbeat, and becomes a follower of n1.
deliver 3 from=1
---
n3@1 follower() ⇨ n3@1 follower(n1)
n3@1 append 1@1 None
n3@1 → n1 AppendResponse match_index=1
n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0

status
---
n1@1 leader last=1@1 commit=0@0 applied=0 progress={2:0→2 3:0→2}
n2@1 candidate last=0@0 commit=0@0 applied=0
n3@1 follower(n1) last=1@1 commit=0@0 applied=0

# n1 and n3 receive n2's Campaign message and reject it.
deliver 1 3 from=2
---
n1@1 → n2 CampaignResponse vote=false
n3@1 → n2 CampaignResponse vote=false

# Stabilizing the cluster results in everyone following n1.
(stabilize heartbeat=true)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/old_campaign_response_ignored
================================================
# Old campaign responses (in the same term) are ignored by leaders and followers
# once a leader is elected.

cluster nodes=7
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0
n4@0 follower() last=0@0 commit=0@0 applied=0
n5@0 follower() last=0@0 commit=0@0 applied=0
n6@0 follower() last=0@0 commit=0@0 applied=0
n7@0 follower() last=0@0 commit=0@0 applied=0

# n1 and n2 campaign.
campaign 1 2
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0
n1@1 → n4 Campaign last=0@0
n1@1 → n5 Campaign last=0@0
n1@1 → n6 Campaign last=0@0
n1@1 → n7 Campaign last=0@0
n2@0 follower() ⇨ n2@1 candidate
n2@1 → n1 Campaign last=0@0
n2@1 → n3 Campaign last=0@0
n2@1 → n4 Campaign last=0@0
n2@1 → n5 Campaign last=0@0
n2@1 → n6 Campaign last=0@0
n2@1 → n7 Campaign last=0@0

# n3-n6 vote for n1, n7 votes for n2.
deliver 3 4 5 6 from=1
deliver 7 from=2
---
n3@0 follower() ⇨ n3@1 follower()
n3@1 → n1 CampaignResponse vote=true
n4@0 follower() ⇨ n4@1 follower()
n4@1 → n1 CampaignResponse vote=true
n5@0 follower() ⇨ n5@1 follower()
n5@1 → n1 CampaignResponse vote=true
n6@0 follower() ⇨ n6@1 follower()
n6@1 → n1 CampaignResponse vote=true
n7@0 follower() ⇨ n7@1 follower()
n7@1 → n2 CampaignResponse vote=true

# n1 receives votes from n3-n5 and assumes leadership.
deliver 1 from=3
deliver 1 from=4
deliver 1 from=5
---
n1@1 candidate ⇨ n1@1 leader
n1@1 append 1@1 None
n1@1 → n2 Append base=0@0 [1@1]
n1@1 → n3 Append base=0@0 [1@1]
n1@1 → n4 Append base=0@0 [1@1]
n1@1 → n5 Append base=0@0 [1@1]
n1@1 → n6 Append base=0@0 [1@1]
n1@1 → n7 Append base=0@0 [1@1]
n1@1 → n2 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n4 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n5 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n6 Heartbeat last_index=1 commit_index=0 read_seq=0
n1@1 → n7 Heartbeat last_index=1 commit_index=0 read_seq=0

# n2 receives n1's campaign, append, and heartbeat. It rejects the campaign,
# then becomes a follower of n1.
deliver 2 from=1
---
n2@1 → n1 CampaignResponse vote=false
n2@1 candidate ⇨ n2@1 follower(n1)
n2@1 append 1@1 None
n2@1 → n1 AppendResponse match_index=1
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0

# n1 (leader) receives n6's vote and ignores it. n2 (follower) receives n7's
# vote and ignores it. They remain leader and follower.
deliver 1 from=6
deliver 2 from=7
status
---
n1@1 leader last=1@1 commit=0@0 applied=0 progress={2:0→2 3:0→2 4:0→2 5:0→2 6:0→2 7:0→2}
n2@1 follower(n1) last=1@1 commit=0@0 applied=0
n3@1 follower() last=0@0 commit=0@0 applied=0
n4@1 follower() last=0@0 commit=0@0 applied=0
n5@1 follower() last=0@0 commit=0@0 applied=0
n6@1 follower() last=0@0 commit=0@0 applied=0
n7@1 follower() last=0@0 commit=0@0 applied=0

# Stabilizing the cluster results in everyone following n1.
(stabilize heartbeat=true)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2 6:1→2 7:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1
n6@1 follower(n1) last=1@1 commit=1@1 applied=1
n7@1 follower(n1) last=1@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/old_heartbeat_ignored
================================================
# A heartbeat from an old leader should be ignored.

# Make n3 leader.
cluster nodes=3 leader=3
---
n1@1 follower(n3) last=1@1 commit=1@1 applied=1
n2@1 follower(n3) last=1@1 commit=1@1 applied=1
n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2}

# Partition n3 away from the cluster.
partition 3
---
n3 ⇹ n1 n2

# Elect n1 as a new leader.
(campaign 1)
(stabilize heartbeat=true)
status
---
n1@2 leader last=2@2 commit=2@2 applied=2 progress={2:2→3 3:0→3}
n2@2 follower(n1) last=2@2 commit=2@2 applied=2
n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2}

# Heal the partition.
heal
---
n1 n2 n3 fully connected

# The next heartbeat from n3 is ignored.
heartbeat 3
stabilize
---
n3@1 → n1 Heartbeat last_index=1 commit_index=1 read_seq=0
n3@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0

status
---
n1@2 leader last=2@2 commit=2@2 applied=2 progress={2:2→3 3:0→3}
n2@2 follower(n1) last=2@2 commit=2@2 applied=2
n3@1 leader last=1@1 commit=1@1 applied=1 progress={1:1→2 2:1→2}


================================================
FILE: src/raft/testscripts/node/request_candidate_abort
================================================
# Client read/write requests fail on candidates.

cluster nodes=3
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# n1 campaigns.
campaign 1
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0

# A read request on n1 should be rejected.
get 1 foo
---
c1@1 → n1 ClientRequest id=0x01 read 0x0003666f6f
n1@1 → c1 ClientResponse id=0x01 Error::Abort
c1@1 get foo ⇒ Error::Abort (operation aborted)

# A write request on n1 should be rejected.
put 1 foo=bar
---
c1@1 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172
n1@1 → c1 ClientResponse id=0x02 Error::Abort
c1@1 put foo=bar ⇒ Error::Abort (operation aborted)


================================================
FILE: src/raft/testscripts/node/request_follower
================================================
# Client read/write requests are proxied by followers.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# An initial get on a follower yields None.
get 2 foo
stabilize
---
c2@1 → n2 ClientRequest id=0x01 read 0x0003666f6f
n2@1 → n1 ClientRequest id=0x01 read 0x0003666f6f
n1@1 → n2 Read seq=1
n1@1 → n3 Read seq=1
n2@1 → n1 ReadResponse seq=1
n3@1 → n1 ReadResponse seq=1
n1@1 → n2 ClientResponse id=0x01 read 0x0000
n2@1 → c2 ClientResponse id=0x01 read 0x0000
c2@1 get foo ⇒ None

# Write a value on the follower.
put 2 foo=bar
stabilize
(stabilize heartbeat=true)
---
c2@1 → n2 ClientRequest id=0x02 write 0x0103666f6f03626172
n2@1 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]
n2@1 append 2@1 put foo=bar
n2@1 → n1 AppendResponse match_index=2
n3@1 append 2@1 put foo=bar
n3@1 → n1 AppendResponse match_index=2
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → n2 ClientResponse id=0x02 write 0x0102
n2@1 → c2 ClientResponse id=0x02 write 0x0102
c2@1 put foo=bar ⇒ 2

# Read the value back on the follower.
get 2 foo
stabilize
---
c2@1 → n2 ClientRequest id=0x03 read 0x0003666f6f
n2@1 → n1 ClientRequest id=0x03 read 0x0003666f6f
n1@1 → n2 Read seq=2
n1@1 → n3 Read seq=2
n2@1 → n1 ReadResponse seq=2
n3@1 → n1 ReadResponse seq=2
n1@1 → n2 ClientResponse id=0x03 read 0x000103626172
n2@1 → c2 ClientResponse id=0x03 read 0x000103626172
c2@1 get foo ⇒ bar


================================================
FILE: src/raft/testscripts/node/request_follower_campaign_abort
================================================
# A follower aborts in-flight requests when another node's campaign moves it to
# a new term.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Submit a read and write on n2.
put 2 foo=bar
get 2 foo
---
c2@1 → n2 ClientRequest id=0x01 write 0x0103666f6f03626172
n2@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
c2@1 → n2 ClientRequest id=0x02 read 0x0003666f6f
n2@1 → n1 ClientRequest id=0x02 read 0x0003666f6f

# n3 campaigns before n2's requests achieve quorum.
campaign 3
---
n3@1 follower(n1) ⇨ n3@2 candidate
n3@2 → n1 Campaign last=1@1
n3@2 → n2 Campaign last=1@1

# When n2 receives the campaign message, the requests are aborted.
deliver 2 from=3
---
n2@1 follower(n1) ⇨ n2@2 follower()
n2@1 → c2 ClientResponse id=0x01 Error::Abort
c2@1 put foo=bar ⇒ Error::Abort (operation aborted)
n2@1 → c2 ClientResponse id=0x02 Error::Abort
c2@1 get foo ⇒ Error::Abort (operation aborted)
n2@2 → n3 CampaignResponse vote=true


================================================
FILE: src/raft/testscripts/node/request_follower_disconnect_stall
================================================
# Client read/write requests stall if the follower is disconnected from the
# leader when the request is submitted. They are not retried, nor aborted.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n3 away from the cluster.
partition 3
---
n3 ⇹ n1 n2

# Submit write and read requests to n3. They don't return a result.
put 3 foo=bar
get 3 foo
stabilize
---
c3@1 → n3 ClientRequest id=0x01 write 0x0103666f6f03626172
n3@1 ⇥ n1 C̶l̶i̶e̶n̶t̶R̶e̶q̶u̶e̶s̶t̶ ̶i̶d̶=̶0̶x̶0̶1̶ ̶w̶r̶i̶t̶e̶ ̶0̶x̶0̶1̶0̶3̶6̶6̶6̶f̶6̶f̶0̶3̶6̶2̶6̶1̶7̶2̶
c3@1 → n3 ClientRequest id=0x02 read 0x0003666f6f
n3@1 ⇥ n1 C̶l̶i̶e̶n̶t̶R̶e̶q̶u̶e̶s̶t̶ ̶i̶d̶=̶0̶x̶0̶2̶ ̶r̶e̶a̶d̶ ̶0̶x̶0̶0̶0̶3̶6̶6̶6̶f̶6̶f̶

# Heal the partition and heartbeat. The requests still don't return a result.
heal
---
n1 n2 n3 fully connected

stabilize heartbeat=true
---
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0


================================================
FILE: src/raft/testscripts/node/request_follower_leaderless_abort
================================================
# Client read/write requests fail on leaderless followers.

cluster nodes=3
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# A read request on n1 should be rejected.
get 1 foo
---
c1@0 → n1 ClientRequest id=0x01 read 0x0003666f6f
n1@0 → c1 ClientResponse id=0x01 Error::Abort
c1@0 get foo ⇒ Error::Abort (operation aborted)

# A write request on n1 should be rejected.
put 1 foo=bar
---
c1@0 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172
n1@0 → c1 ClientResponse id=0x02 Error::Abort
c1@0 put foo=bar ⇒ Error::Abort (operation aborted)


================================================
FILE: src/raft/testscripts/node/request_leader
================================================
# Client read/write requests succeed on leaders.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# An initial get on the leader yields None.
get 1 foo
stabilize
---
c1@1 → n1 ClientRequest id=0x01 read 0x0003666f6f
n1@1 → n2 Read seq=1
n1@1 → n3 Read seq=1
n2@1 → n1 ReadResponse seq=1
n3@1 → n1 ReadResponse seq=1
n1@1 → c1 ClientResponse id=0x01 read 0x0000
c1@1 get foo ⇒ None

# Write a value on the leader.
put 1 foo=bar
stabilize
(stabilize heartbeat=true)
---
c1@1 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]
n2@1 append 2@1 put foo=bar
n2@1 → n1 AppendResponse match_index=2
n3@1 append 2@1 put foo=bar
n3@1 → n1 AppendResponse match_index=2
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x02 write 0x0102
c1@1 put foo=bar ⇒ 2

# Read the value back on the leader.
get 1 foo
stabilize
---
c1@1 → n1 ClientRequest id=0x03 read 0x0003666f6f
n1@1 → n2 Read seq=2
n1@1 → n3 Read seq=2
n2@1 → n1 ReadResponse seq=2
n3@1 → n1 ReadResponse seq=2
n1@1 → c1 ClientResponse id=0x03 read 0x000103626172
c1@1 get foo ⇒ bar


================================================
FILE: src/raft/testscripts/node/request_leader_campaign_abort
================================================
# A leader aborts in-flight requests when it steps down.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Submit a read and write on n1.
put 1 foo=bar
get 1 foo
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]
c1@1 → n1 ClientRequest id=0x02 read 0x0003666f6f
n1@1 → n2 Read seq=1
n1@1 → n3 Read seq=1

# n2 campaigns before n1's requests achieve quorum.
campaign 2
---
n2@1 follower(n1) ⇨ n2@2 candidate
n2@2 → n1 Campaign last=1@1
n2@2 → n3 Campaign last=1@1

# When n1 receives the campaign message, the requests are aborted.
deliver 1 from=2
---
n1@1 leader ⇨ n1@2 follower()
n1@1 → c1 ClientResponse id=0x01 Error::Abort
c1@1 put foo=bar ⇒ Error::Abort (operation aborted)
n1@1 → c1 ClientResponse id=0x02 Error::Abort
c1@1 get foo ⇒ Error::Abort (operation aborted)
n1@2 → n2 CampaignResponse vote=false


================================================
FILE: src/raft/testscripts/node/request_leader_change_linearizability
================================================
# A new leader that's behind on commit/apply shouldn't serve stale reads.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Write an initial value, and propagate the commit index.
(put 1 a=1)
(stabilize heartbeat=true)
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:2→3}
n2@1 follower(n1) last=2@1 commit=2@1 applied=2
n3@1 follower(n1) last=2@1 commit=2@1 applied=2

# Write another value, but don't propagate the commit index.
(put 1 b=2)
(stabilize)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4}
n2@1 follower(n1) last=3@1 commit=2@1 applied=2
n3@1 follower(n1) last=3@1 commit=2@1 applied=2

# n2 now campaigns and wins, while being behind on commit/apply.
campaign 2
deliver
---
n2@1 follower(n1) ⇨ n2@2 candidate
n2@2 → n1 Campaign last=3@1
n2@2 → n3 Campaign last=3@1
n1@1 leader ⇨ n1@2 follower()
n1@2 → n2 CampaignResponse vote=true
n3@1 follower(n1) ⇨ n3@2 follower()
n3@2 → n2 CampaignResponse vote=true

# The initial append doesn't make it to the followers, so n2's commit index
# trails the previous leader's.
partition 2
deliver 2
---
n2 ⇹ n1 n3
n2@2 candidate ⇨ n2@2 leader
n2@2 append 4@2 None
n2@2 ⇥ n1 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶2̶]̶
n2@2 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶3̶@̶1̶ ̶[̶4̶@̶2̶]̶
n2@2 ⇥ n1 H̶e̶a̶r̶t̶b̶e̶a̶t̶ ̶l̶a̶s̶t̶_̶i̶n̶d̶e̶x̶=̶4̶ ̶c̶o̶m̶m̶i̶t̶_̶i̶n̶d̶e̶x̶=̶2̶ ̶r̶e̶a̶d̶_̶s̶e̶q̶=̶0̶
n2@2 ⇥ n3 H̶e̶a̶r̶t̶b̶e̶a̶t̶ ̶l̶a̶s̶t̶_̶i̶n̶d̶e̶x̶=̶4̶ ̶c̶o̶m̶m̶i̶t̶_̶i̶n̶d̶e̶x̶=̶2̶ ̶r̶e̶a̶d̶_̶s̶e̶q̶=̶0̶

heal
status
---
n1 n2 n3 fully connected
n1@2 follower() last=3@1 commit=3@1 applied=3
n2@2 leader last=4@2 commit=2@1 applied=2 progress={1:0→5 3:0→5}
n3@2 follower() last=3@1 commit=2@1 applied=2

# Reading from n2 should not result in a stale read even if followers
# confirm the read sequence.
get 2 b
deliver
deliver
---
c2@2 → n2 ClientRequest id=0x03 read 0x000162
n2@2 → n1 Read seq=1
n2@2 → n3 Read seq=1
n1@2 follower() ⇨ n1@2 follower(n2)
n1@2 → n2 ReadResponse seq=1
n3@2 follower() ⇨ n3@2 follower(n2)
n3@2 → n2 ReadResponse seq=1

# The leader heartbeats and detects the lost appends.
heartbeat 2
deliver
deliver
deliver
---
n2@2 → n1 Heartbeat last_index=4 commit_index=2 read_seq=1
n2@2 → n3 Heartbeat last_index=4 commit_index=2 read_seq=1
n1@2 → n2 HeartbeatResponse match_index=0 read_seq=1
n3@2 → n2 HeartbeatResponse match_index=0 read_seq=1
n2@2 → n1 Append base=3@1 []
n2@2 → n3 Append base=3@1 []
n1@2 → n2 AppendResponse match_index=3
n3@2 → n2 AppendResponse match_index=3

# It resends the missing log entry.
deliver
deliver
---
n2@2 → n1 Append base=3@1 [4@2]
n2@2 → n3 Append base=3@1 [4@2]
n1@2 append 4@2 None
n1@2 → n2 AppendResponse match_index=4
n3@2 append 4@2 None
n3@2 → n2 AppendResponse match_index=4

# Once the leader receives the acks it commits the entry. The read can now be
# served, resulting in an up-to-date b=2.
stabilize
---
n2@2 commit 4@2
n2@2 apply 3@1 put b=2
n2@2 apply 4@2 None
n2@2 → c2 ClientResponse id=0x03 read 0x00010132
c2@2 get b ⇒ 2


================================================
FILE: src/raft/testscripts/node/request_leader_disconnect
================================================
# Client read/write requests succeed if the leader is disconnected from the
# quorum when the request is submitted but it later reconnects.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition n1 away from the cluster.
partition 1
---
n1 ⇹ n2 n3

# Submit write and read requests to n1. They don't return a result.
put 1 foo=bar
get 1 foo
stabilize
---
c1@1 → n1 ClientRequest id=0x01 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 ⇥ n2 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
n1@1 ⇥ n3 A̶p̶p̶e̶n̶d̶ ̶b̶a̶s̶e̶=̶1̶@̶1̶ ̶[̶2̶@̶1̶]̶
c1@1 → n1 ClientRequest id=0x02 read 0x0003666f6f
n1@1 ⇥ n2 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶
n1@1 ⇥ n3 R̶e̶a̶d̶ ̶s̶e̶q̶=̶1̶

# Heal the partition and heartbeat. The requests eventually return results.
heal
---
n1 n2 n3 fully connected

stabilize heartbeat=true
---
n1@1 → n2 Heartbeat last_index=2 commit_index=1 read_seq=1
n1@1 → n3 Heartbeat last_index=2 commit_index=1 read_seq=1
n2@1 → n1 HeartbeatResponse match_index=0 read_seq=1
n3@1 → n1 HeartbeatResponse match_index=0 read_seq=1
n1@1 → c1 ClientResponse id=0x02 read 0x0000
c1@1 get foo ⇒ None
n1@1 → n2 Append base=1@1 [2@1]
n1@1 → n3 Append base=1@1 [2@1]
n2@1 append 2@1 put foo=bar
n2@1 → n1 AppendResponse match_index=2
n3@1 append 2@1 put foo=bar
n3@1 → n1 AppendResponse match_index=2
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x01 write 0x0102
c1@1 put foo=bar ⇒ 2


================================================
FILE: src/raft/testscripts/node/request_leader_read_quorum
================================================
# Client read requests are only processed once a quorum confirms the read sequence.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Write foo=bar.
(put 1 foo=bar)
(stabilize heartbeat=true)
---
ok

# Read it once.
(get 1 foo)
(stabilize)
---
ok

# Attempt to read it again. The read only returns once a quorum has
# confirmed the read sequence.
get 1 foo
---
c1@1 → n1 ClientRequest id=0x03 read 0x0003666f6f
n1@1 → n2 Read seq=2
n1@1 → n3 Read seq=2
n1@1 → n4 Read seq=2
n1@1 → n5 Read seq=2

deliver 2
deliver 1
---
n2@1 → n1 ReadResponse seq=2

deliver 3
deliver 1
---
n3@1 → n1 ReadResponse seq=2
n1@1 → c1 ClientResponse id=0x03 read 0x000103626172
c1@1 get foo ⇒ bar

(stabilize)
---
ok


================================================
FILE: src/raft/testscripts/node/request_leader_read_quorum_sequence
================================================
# Client read requests are only served once a quorum confirms the read sequence
# number, including higher sequence numbers.

cluster nodes=5 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2 4:1→2 5:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1
n4@1 follower(n1) last=1@1 commit=1@1 applied=1
n5@1 follower(n1) last=1@1 commit=1@1 applied=1

# Write foo=bar and read it back.
(put 1 foo=bar)
(stabilize heartbeat=true)
(get 1 foo)
(stabilize)
---
ok

# Send a heartbeat with sequence number 1, and deliver it to all followers.
heartbeat 1
deliver
---
n1@1 → n2 Heartbeat last_index=2 commit_index=2 read_seq=1
n1@1 → n3 Heartbeat last_index=2 commit_index=2 read_seq=1
n1@1 → n4 Heartbeat last_index=2 commit_index=2 read_seq=1
n1@1 → n5 Heartbeat last_index=2 commit_index=2 read_seq=1
n2@1 → n1 HeartbeatResponse match_index=2 read_seq=1
n3@1 → n1 HeartbeatResponse match_index=2 read_seq=1
n4@1 → n1 HeartbeatResponse match_index=2 read_seq=1
n5@1 → n1 HeartbeatResponse match_index=2 read_seq=1

# Partition n1 away.
partition 1
---
n1 ⇹ n2 n3 n4 n5

# Perform a read at sequence number 2. The read messages are lost.
get 1 foo
---
c1@1 → n1 ClientRequest id=0x03 read 0x0003666f6f
n1@1 ⇥ n2 R̶e̶a̶d̶ ̶s̶e̶q̶=̶2̶
n1@1 ⇥ n3 R̶e̶a̶d̶ ̶s̶e̶q̶=̶2̶
n1@1 ⇥ n4 R̶e̶a̶d̶ ̶s̶e̶q̶=̶2̶
n1@1 ⇥ n5 R̶e̶a̶d̶ ̶s̶e̶q̶=̶2̶

# Deliver the heartbeat responses at sequence number 1. These should not satisfy
# the read at sequence number 2.
deliver 1
---
ok

# Heal the partition and perform another read at sequence number 3. Followers
# respond to the reads at sequence number 3.
heal
get 1 foo
---
n1 n2 n3 n4 n5 fully connected
c1@1 → n1 ClientRequest id=0x04 read 0x0003666f6f
n1@1 → n2 Read seq=3
n1@1 → n3 Read seq=3
n1@1 → n4 Read seq=3
n1@1 → n5 Read seq=3

deliver
---
n2@1 → n1 ReadResponse seq=3
n3@1 → n1 ReadResponse seq=3
n4@1 → n1 ReadResponse seq=3
n5@1 → n1 ReadResponse seq=3

# Once n1 receives two responses it has a read quorum and serves both reads,
# at seqnums 2 (id=0x03) and 3 (id=0x04).
deliver 1 from=3
deliver 1 from=5
---
n1@1 → c1 ClientResponse id=0x03 read 0x000103626172
c1@1 get foo ⇒ bar
n1@1 → c1 ClientResponse id=0x04 read 0x000103626172
c1@1 get foo ⇒ bar


================================================
FILE: src/raft/testscripts/node/request_leader_single
================================================
# Client read/write requests succeed on a lone leader.

cluster nodes=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={}

# An initial get on the leader yields None.
get 1 foo
stabilize
---
c1@1 → n1 ClientRequest id=0x01 read 0x0003666f6f
n1@1 → c1 ClientResponse id=0x01 read 0x0000
c1@1 get foo ⇒ None

# Write a value on the leader.
put 1 foo=bar
stabilize heartbeat=true
---
c1@1 → n1 ClientRequest id=0x02 write 0x0103666f6f03626172
n1@1 append 2@1 put foo=bar
n1@1 commit 2@1
n1@1 apply 2@1 put foo=bar
n1@1 → c1 ClientResponse id=0x02 write 0x0102
c1@1 put foo=bar ⇒ 2

# Read the value back on the leader.
get 1 foo
stabilize
---
c1@1 → n1 ClientRequest id=0x03 read 0x0003666f6f
n1@1 → c1 ClientResponse id=0x03 read 0x000103626172
c1@1 get foo ⇒ bar


================================================
FILE: src/raft/testscripts/node/request_status
================================================
# Status requests return the cluster status.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Partition away n3, so not all nodes have the same log position.
partition 3
---
n3 ⇹ n1 n2

# Replicate a write, but not the commit index.
(put 1 foo=bar)
(stabilize)
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={2:2→3 3:1→3}
n2@1 follower(n1) last=2@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Run a status request on the leader.
status request=true 1
stabilize
---
c1@1 → n1 ClientRequest id=0x02 status
n1@1 → c1 ClientResponse id=0x02 status Status { leader: 1, term: 1, match_index: {1: 2, 2: 2, 3: 1}, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73 } }
c1@1 status ⇒ Status {
    leader: 1,
    term: 1,
    match_index: {
        1: 2,
        2: 2,
        3: 1,
    },
    commit_index: 2,
    applied_index: 2,
    storage: Status {
        name: "bitcask",
        keys: 4,
        size: 41,
        disk_size: 84,
        live_disk_size: 73,
    },
}

# Run a status request on a follower.
status request=true 2
stabilize
---
c2@1 → n2 ClientRequest id=0x03 status
n2@1 → n1 ClientRequest id=0x03 status
n1@1 → n2 ClientResponse id=0x03 status Status { leader: 1, term: 1, match_index: {1: 2, 2: 2, 3: 1}, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73 } }
n2@1 → c2 ClientResponse id=0x03 status Status { leader: 1, term: 1, match_index: {1: 2, 2: 2, 3: 1}, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73 } }
c2@1 status ⇒ Status {
    leader: 1,
    term: 1,
    match_index: {
        1: 2,
        2: 2,
        3: 1,
    },
    commit_index: 2,
    applied_index: 2,
    storage: Status {
        name: "bitcask",
        keys: 4,
        size: 41,
        disk_size: 84,
        live_disk_size: 73,
    },
}


================================================
FILE: src/raft/testscripts/node/request_status_single
================================================
# Status requests return the cluster status on a single node.

cluster nodes=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={}

# Perform a write.
(put 1 foo=bar)
(stabilize)
status
---
n1@1 leader last=2@1 commit=2@1 applied=2 progress={}

# Run a status request on the leader.
status request=true 1
stabilize
---
c1@1 → n1 ClientRequest id=0x02 status
n1@1 → c1 ClientResponse id=0x02 status Status { leader: 1, term: 1, match_index: {1: 2}, commit_index: 2, applied_index: 2, storage: Status { name: "bitcask", keys: 4, size: 41, disk_size: 84, live_disk_size: 73 } }
c1@1 status ⇒ Status {
    leader: 1,
    term: 1,
    match_index: {
        1: 2,
    },
    commit_index: 2,
    applied_index: 2,
    storage: Status {
        name: "bitcask",
        keys: 4,
        size: 41,
        disk_size: 84,
        live_disk_size: 73,
    },
}


================================================
FILE: src/raft/testscripts/node/restart
================================================
# Restarting a cluster that's fully caught up retains the existing state and
# allows trivially electing a new leader.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicate a couple of writes.
(put 1 a=1)
(put 1 b=2)
(stabilize heartbeat=true)
---
ok

# Dump the current status, log, and state.
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4}
n2@1 follower(n1) last=3@1 commit=3@1 applied=3
n3@1 follower(n1) last=3@1 commit=3@1 applied=3

log
---
n1@1 term=1 last=3@1 commit=3@1 vote=Some(1)
n1@1 entry 1@1 None
n1@1 entry 2@1 put a=1
n1@1 entry 3@1 put b=2
n2@1 term=1 last=3@1 commit=3@1 vote=Some(1)
n2@1 entry 1@1 None
n2@1 entry 2@1 put a=1
n2@1 entry 3@1 put b=2
n3@1 term=1 last=3@1 commit=3@1 vote=Some(1)
n3@1 entry 1@1 None
n3@1 entry 2@1 put a=1
n3@1 entry 3@1 put b=2

state
---
n1@1 applied=3
n1@1 state a=1
n1@1 state b=2
n2@1 applied=3
n2@1 state a=1
n2@1 state b=2
n3@1 applied=3
n3@1 state a=1
n3@1 state b=2

# Restart the nodes. They retain the same status, logs, and state.
restart
---
n1@1 follower() last=3@1 commit=3@1 applied=3
n2@1 follower() last=3@1 commit=3@1 applied=3
n3@1 follower() last=3@1 commit=3@1 applied=3

log
---
n1@1 term=1 last=3@1 commit=3@1 vote=Some(1)
n1@1 entry 1@1 None
n1@1 entry 2@1 put a=1
n1@1 entry 3@1 put b=2
n2@1 term=1 last=3@1 commit=3@1 vote=Some(1)
n2@1 entry 1@1 None
n2@1 entry 2@1 put a=1
n2@1 entry 3@1 put b=2
n3@1 term=1 last=3@1 commit=3@1 vote=Some(1)
n3@1 entry 1@1 None
n3@1 entry 2@1 put a=1
n3@1 entry 3@1 put b=2

state
---
n1@1 applied=3
n1@1 state a=1
n1@1 state b=2
n2@1 applied=3
n2@1 state a=1
n2@1 state b=2
n3@1 applied=3
n3@1 state a=1
n3@1 state b=2

# Elect a new leader.
campaign 3
stabilize heartbeat=true
---
n3@1 follower() ⇨ n3@2 candidate
n3@2 → n1 Campaign last=3@1
n3@2 → n2 Campaign last=3@1
n1@1 follower() ⇨ n1@2 follower()
n1@2 → n3 CampaignResponse vote=true
n2@1 follower() ⇨ n2@2 follower()
n2@2 → n3 CampaignResponse vote=true
n3@2 candidate ⇨ n3@2 leader
n3@2 append 4@2 None
n3@2 → n1 Append base=3@1 [4@2]
n3@2 → n2 Append base=3@1 [4@2]
n3@2 → n1 Heartbeat last_index=4 commit_index=3 read_seq=0
n3@2 → n2 Heartbeat last_index=4 commit_index=3 read_seq=0
n1@2 follower() ⇨ n1@2 follower(n3)
n1@2 append 4@2 None
n1@2 → n3 AppendResponse match_index=4
n1@2 → n3 HeartbeatResponse match_index=4 read_seq=0
n2@2 follower() ⇨ n2@2 follower(n3)
n2@2 append 4@2 None
n2@2 → n3 AppendResponse match_index=4
n2@2 → n3 HeartbeatResponse match_index=4 read_seq=0
n3@2 commit 4@2
n3@2 → n1 Heartbeat last_index=4 commit_index=4 read_seq=0
n3@2 → n2 Heartbeat last_index=4 commit_index=4 read_seq=0
n1@2 commit 4@2
n1@2 → n3 HeartbeatResponse match_index=4 read_seq=0
n2@2 commit 4@2
n2@2 → n3 HeartbeatResponse match_index=4 read_seq=0

status
---
n1@2 follower(n3) last=4@2 commit=4@2 applied=4
n2@2 follower(n3) last=4@2 commit=4@2 applied=4
n3@2 leader last=4@2 commit=4@2 applied=4 progress={1:4→5 2:4→5}


================================================
FILE: src/raft/testscripts/node/restart_apply
================================================
# Restarting a node and wiping its state machine will reapply the state.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicate a couple of writes.
(put 1 a=1)
(put 1 b=2)
(stabilize heartbeat=true)
---
ok

# Restart n3 and clear its state machine. The node will apply all pending
# entries when restarting.
restart 3 applied_index=0
---
n3@1 apply 1@1 None
n3@1 apply 2@1 put a=1
n3@1 apply 3@1 put b=2
n3@1 follower() last=3@1 commit=3@1 applied=3

state 3
---
n3@1 applied=3
n3@1 state a=1
n3@1 state b=2

# Restart n3 and lose the last write. It will also be reapplied.
restart 3 applied_index=2
---
n3@1 apply 3@1 put b=2
n3@1 follower() last=3@1 commit=3@1 applied=3

state 3
---
n3@1 applied=3
n3@1 state a=1
n3@1 state b=2


================================================
FILE: src/raft/testscripts/node/restart_commit_recover
================================================
# Restarting the cluster and wiping the commit indexes allows
# a new leader to recover the commit index.

cluster nodes=3 leader=1
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Replicate a couple of writes, but don't propagate the commit index.
(put 1 a=1)
(put 1 b=2)
(stabilize)
status
---
n1@1 leader last=3@1 commit=3@1 applied=3 progress={2:3→4 3:3→4}
n2@1 follower(n1) last=3@1 commit=1@1 applied=1
n3@1 follower(n1) last=3@1 commit=1@1 applied=1

# Restart all nodes and wipe the commit index.
restart commit_index=0
---
n1@1 follower() last=3@1 commit=0@0 applied=3
n2@1 follower() last=3@1 commit=0@0 applied=1
n3@1 follower() last=3@1 commit=0@0 applied=1

# n3 campaigns for leadership and recovers the commit index.
campaign 3
stabilize
---
n3@1 follower() ⇨ n3@2 candidate
n3@2 → n1 Campaign last=3@1
n3@2 → n2 Campaign last=3@1
n1@1 follower() ⇨ n1@2 follower()
n1@2 → n3 CampaignResponse vote=true
n2@1 follower() ⇨ n2@2 follower()
n2@2 → n3 CampaignResponse vote=true
n3@2 candidate ⇨ n3@2 leader
n3@2 append 4@2 None
n3@2 → n1 Append base=3@1 [4@2]
n3@2 → n2 Append base=3@1 [4@2]
n3@2 → n1 Heartbeat last_index=4 commit_index=0 read_seq=0
n3@2 → n2 Heartbeat last_index=4 commit_index=0 read_seq=0
n1@2 follower() ⇨ n1@2 follower(n3)
n1@2 append 4@2 None
n1@2 → n3 AppendResponse match_index=4
n1@2 → n3 HeartbeatResponse match_index=4 read_seq=0
n2@2 follower() ⇨ n2@2 follower(n3)
n2@2 append 4@2 None
n2@2 → n3 AppendResponse match_index=4
n2@2 → n3 HeartbeatResponse match_index=4 read_seq=0
n3@2 commit 4@2

status
---
n1@2 follower(n3) last=4@2 commit=0@0 applied=3
n2@2 follower(n3) last=4@2 commit=0@0 applied=1
n3@2 leader last=4@2 commit=4@2 applied=4 progress={1:4→5 2:4→5}

# A heartbeat propagates the commit index.
heartbeat 3
stabilize
---
n3@2 → n1 Heartbeat last_index=4 commit_index=4 read_seq=0
n3@2 → n2 Heartbeat last_index=4 commit_index=4 read_seq=0
n1@2 commit 4@2
n1@2 → n3 HeartbeatResponse match_index=4 read_seq=0
n2@2 commit 4@2
n2@2 → n3 HeartbeatResponse match_index=4 read_seq=0


================================================
FILE: src/raft/testscripts/node/restart_term_vote
================================================
# The term/vote is retained across a restart.

cluster nodes=3
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# Start a new election on n1.
campaign 1
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0

# n3 votes for n1, and then restarts.
deliver 3
---
n3@0 follower() ⇨ n3@1 follower()
n3@1 → n1 CampaignResponse vote=true

restart 3
---
n3@1 follower() last=0@0 commit=0@0 applied=0

# n3 still has a record of the term and vote in the log.
log 3
---
n3@1 term=1 last=0@0 commit=0@0 vote=Some(1)

# n2 also campaigns. n3 does not grant its vote.
campaign 2
---
n2@0 follower() ⇨ n2@1 candidate
n2@1 → n1 Campaign last=0@0
n2@1 → n3 Campaign last=0@0

deliver 3
---
n3@1 → n2 CampaignResponse vote=false

# n1 wins leadership.
(stabilize)
status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=0@0 applied=0
n3@1 follower(n1) last=1@1 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/tick_candidate
================================================
# Ticking a candidate will eventually hold a new election in a later term.

cluster nodes=3 heartbeat_interval=1 election_timeout=2
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# n1 campaigns.
campaign 1
---
n1@0 follower() ⇨ n1@1 candidate
n1@1 → n2 Campaign last=0@0
n1@1 → n3 Campaign last=0@0

# A single tick does nothing.
tick 1
---
ok

# Another tick campaigns in a later term.
tick 1
---
n1@1 candidate ⇨ n1@2 candidate
n1@2 → n2 Campaign last=0@0
n1@2 → n3 Campaign last=0@0

status
---
n1@2 candidate last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/tick_follower
================================================
# Ticking a follower will transition it to candidate if it hasn't
# heard from the leader in a while.

cluster nodes=3 leader=1 heartbeat_interval=1 election_timeout=2
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# A single follower tick does nothing.
tick 2
---
ok

# If n1 heartbeats, the election counter is reset, and another n2 tick does nothing.
heartbeat 1
stabilize
---
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0
n2@1 → n1 HeartbeatResponse match_index=1 read_seq=0
n3@1 → n1 HeartbeatResponse match_index=1 read_seq=0

tick 2
---
ok

# Ticking n2 again exceeds the election timeout, making it campaign.
tick 2
---
n2@1 follower(n1) ⇨ n2@2 candidate
n2@2 → n1 Campaign last=1@1
n2@2 → n3 Campaign last=1@1

status
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@2 candidate last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1


================================================
FILE: src/raft/testscripts/node/tick_follower_leaderless
================================================
# Ticking a leaderless follower will eventually transition it to candidate.

cluster nodes=3 heartbeat_interval=1 election_timeout=2
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@0 follower() last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0

# A single follower tick does nothing.
tick 2
---
ok

# Another tick makes it campaign.
tick 2
---
n2@0 follower() ⇨ n2@1 candidate
n2@1 → n1 Campaign last=0@0
n2@1 → n3 Campaign last=0@0

status
---
n1@0 follower() last=0@0 commit=0@0 applied=0
n2@1 candidate last=0@0 commit=0@0 applied=0
n3@0 follower() last=0@0 commit=0@0 applied=0


================================================
FILE: src/raft/testscripts/node/tick_leader
================================================
# Ticking a leader should cause it to emit heartbeats, even when it doesn't
# hear back from any followers.

cluster nodes=3 leader=1 heartbeat_interval=1 election_timeout=2
---
n1@1 leader last=1@1 commit=1@1 applied=1 progress={2:1→2 3:1→2}
n2@1 follower(n1) last=1@1 commit=1@1 applied=1
n3@1 follower(n1) last=1@1 commit=1@1 applied=1

# Ticking n1 will emit a heartbeat.
tick 1
---
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0

# Ticking n1 again will emit further heartbeats, even when it hasn't heard from
# any followers.
tick 1
tick 1
tick 1
---
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n2 Heartbeat last_index=1 commit_index=1 read_seq=0
n1@1 → n3 Heartbeat last_index=1 commit_index=1 read_seq=0


================================================
FILE: src/server.rs
================================================
use std::collections::HashMap;
use std::io::{BufReader, BufWriter, Write as _};
use std::net::{TcpListener, TcpStream, ToSocketAddrs};
use std::time::Duration;

use crossbeam::channel::{Receiver, Sender};
use log::{debug, error, info};
use serde::{Deserialize, Serialize};
use uuid::Uuid;

use crate::encoding::{self, Value as _};
use crate::error::Result;
use crate::raft;
use crate::sql;
use crate::sql::engine::{Catalog as _, Engine as _};
use crate::sql::execution::StatementResult;
use crate::sql::types::{Row, Table};
use crate::storage;

/// Capacity of each outbound Raft peer channel. Messages queue up here while a
/// peer is slow or unavailable; once the channel is full, further messages are
/// dropped.
const RAFT_PEER_CHANNEL_CAPACITY: usize = 1_000;

/// How long to wait between attempts to connect to a Raft peer.
const RAFT_PEER_RETRY_INTERVAL: Duration = Duration::from_secs(1);

/// A toyDB server. Routes messages to/from an inner Raft node.
///
/// * Listens for inbound SQL connections from clients via TCP and passes
///   requests to the local Raft node.
///
/// * Listens for inbound Raft connections from other toyDB nodes via TCP and
///   passes messages to the local Raft node.
///
/// * Connects to other toyDB nodes via TCP and sends outbound Raft messages
///   from the local Raft node.
///
/// A server is built with `Server::new` and then consumed by `Server::serve`,
/// which binds the TCP listeners and spawns the worker threads.
pub struct Server {
    /// The inner Raft node.
    node: raft::Node,
    /// Outbound messages from the Raft node. This is the receiving half of the
    /// unbounded channel whose sending half is handed to the node in
    /// `Server::new`.
    node_rx: Receiver<raft::Envelope>,
    /// Raft peer IDs and addresses. The IDs are also given to the Raft node as
    /// its set of peers, and `Server::serve` spawns one outbound sender thread
    /// per entry.
    peers: HashMap<raft::NodeID, String>,
}

impl Server {
    /// Builds a toyDB server around a freshly constructed Raft node.
    ///
    /// * `id`: the ID of the local Raft node.
    /// * `peers`: the other Raft nodes, as ID/address pairs. The IDs are
    ///   passed to the Raft node, the addresses are kept for connecting.
    /// * `raft_log`: the Raft log used by the node.
    /// * `raft_state`: the state machine driven by the node.
    ///
    /// Errors if the Raft node can't be created.
    pub fn new(
        id: raft::NodeID,
        peers: HashMap<raft::NodeID, String>,
        raft_log: raft::Log,
        raft_state: Box<dyn raft::State>,
    ) -> Result<Self> {
        // The node emits its outbound messages on this channel; the server
        // keeps the receiving end and routes them when serving.
        let (outbound_tx, outbound_rx) = crossbeam::channel::unbounded();
        let peer_ids = peers.keys().copied().collect();
        let options = raft::Options::default();
        let node = raft::Node::new(id, peer_ids, raft_log, raft_state, outbound_tx, options)?;
        Ok(Self { node, node_rx: outbound_rx, peers })
    }

    /// Serves Raft and SQL requests indefinitely. Consumes the server.
    pub fn serve(self, raft_addr: impl ToSocketAddrs, sql_addr: impl ToSocketAddrs) -> Result<()> {
        let raft_listener = TcpListener::bind(raft_addr)?;
        let sql_listener = TcpListener::bind(sql_addr)?;
        info!(
            "Listening on {} (SQL) and {} (Raft)",
            sql_listener.local_addr()?,
            raft_listener.local_addr()?
        );

        std::thread::scope(move |s| {
            let id = self.node.id();
            let (raft_request_tx, raft_request_rx) = crossbeam::channel::unbounded();
            let (raft_step_tx, raft_step_rx) = crossbeam::channel::unbounded();

            // Serve inbound Raft connections.
            s.spawn(move || Self::raft_accept(raft_listener, raft_step_tx));

            // Establish outbound Raft connections to peers.
            let mut raft_peers_tx = HashMap::new();
            for (id, addr) in self.peers.into_iter() {
                let (raft_peer_tx, raft_peer_rx) =
                    crossbeam::channel::bounded(RAFT_PEER_CHANNEL_CAPACITY);
                raft_peers_tx.insert(id, raft_peer_tx);
                s.spawn(move || Self::raft_send_peer(addr, raft_peer_rx));
            }

            // Route Raft messages between the local node, peers, and clients.
            s.spawn(move || {
                Self::raft_route(
                    self.node,
                    self.node_rx,
                    raft_step_rx,
                    raft_peers_tx,
                    raft_request_rx,
                )
            });

            // Serve inbound SQL connections.
            let sql_engine = sql::engine::Raft::new(raft_request_tx);
            s.spawn(move || Self::sql_accept(id, sql_listener, sql_engine));
        });

        Ok(())
    }

    /// Accepts new inbound Raft connections from peers and spawns threads
    /// routing inbound messages to the local Raft node.
    fn raft_accept(listener: TcpListener, raft_step_tx: Sender<raft::Envelope>) {
        std::thread::scope(|s| {
            loop {
                let (socket, peer) = match listener.accept() {
                    Ok((socket, peer)) => (socket, peer),
                    Err(err) => {
                        error!("Raft peer accept failed: {err}");
                        continue;
                    }
                };
                let raft_step_tx = raft_step_tx.clone();
                s.spawn(move || {
                    debug!("Raft peer {peer} connected");
                    match Self::raft_receive_peer(socket, raft_step_tx) {
                        Ok(()) => debug!("Raft peer {peer} disconnected"),
                        Err(err) => error!("Raft peer {peer} error: {err}"),
                    }
                });
            }
        });
    }

    /// Reads Raft messages from a peer's TCP connection and queues each one on
    /// `raft_step_tx` for stepping into the Raft node.
    ///
    /// Returns `Ok` once the decoder reports that there are no more messages,
    /// and errors if decoding fails or the step channel can't accept a message.
    fn raft_receive_peer(socket: TcpStream, raft_step_tx: Sender<raft::Envelope>) -> Result<()> {
        let mut reader = BufReader::new(socket);
        loop {
            match raft::Envelope::maybe_decode_from(&mut reader)? {
                Some(envelope) => raft_step_tx.send(envelope)?,
                None => return Ok(()),
            }
        }
    }

    /// Sends outbound messages to a peer via TCP. Retries indefinitely if the
    /// connection fails, sleeping for `RAFT_PEER_RETRY_INTERVAL` between
    /// connection attempts.
    ///
    /// * `addr`: the peer's address to connect to.
    /// * `raft_node_rx`: the queue of outbound messages destined for this peer.
    ///
    /// A message that fails to send is logged and lost; the connection is then
    /// re-established for subsequent messages. Returns once `raft_node_rx` is
    /// disconnected, i.e. when all senders have been dropped and no further
    /// messages can ever arrive.
    fn raft_send_peer(addr: String, raft_node_rx: Receiver<raft::Envelope>) {
        loop {
            let mut socket = match TcpStream::connect(&addr) {
                Ok(socket) => BufWriter::new(socket),
                Err(err) => {
                    error!("Failed connecting to Raft peer {addr}: {err}");
                    std::thread::sleep(RAFT_PEER_RETRY_INTERVAL);
                    continue;
                }
            };
            loop {
                // recv() only fails once the channel is both empty and
                // disconnected. Reconnecting in that case would spin in a
                // tight connect/disconnect loop against the peer, since every
                // subsequent recv() fails immediately, so stop instead.
                let Ok(message) = raft_node_rx.recv() else {
                    debug!("Outbound channel for Raft peer {addr} closed, stopping sender");
                    return;
                };
                // Flush after every message so it's sent immediately rather
                // than sitting in the write buffer.
                if let Err(err) = message.encode_into(&mut socket).and_then(|_| Ok(socket.flush()?))
                {
                    error!("Failed sending to Raft peer {addr}: {err}");
                    break;
                }
            }
            debug!("Disconnected from Raft peer {addr}");
        }
    }

    /// Routes Raft messages, and drives the local Raft node. Runs forever.
    ///
    /// * node: the local Raft node. It is ticked every `raft::TICK_INTERVAL`,
    ///   and all inbound messages are stepped into it.
    ///
    /// * node_rx: outbound messages from the local Raft node. Routed to peers
    ///   via TCP, or to local clients via a response channel.
    ///
    /// * request_rx: inbound requests from local SQL clients. Stepped into
    ///   the local Raft node as ClientRequest messages. Responses are returned
    ///   via the provided response channel.
    ///
    /// * peers_rx: inbound messages from remote Raft peers. Stepped into the
    ///   local Raft node.
    ///
    /// * peers_tx: outbound per-peer channels sent via TCP connections.
    ///   Messages from the local node's node_rx are sent here. If a peer's
    ///   channel is full, the message is dropped and an error is logged.
    ///
    /// Panics on any errors, since the Raft node can't recover from failed
    /// state transitions. This includes disconnected channels and messages
    /// addressed to unknown peers.
    fn raft_route(
        mut node: raft::Node,
        node_rx: Receiver<raft::Envelope>,
        peers_rx: Receiver<raft::Envelope>,
        mut peers_tx: HashMap<raft::NodeID, Sender<raft::Envelope>>,
        request_rx: Receiver<(raft::Request, Sender<Result<raft::Response>>)>,
    ) {
        // Track response channels by request ID. The Raft node will emit
        // ClientResponse messages that we forward to the response channel.
        let mut response_txs = HashMap::<raft::RequestID, Sender<Result<raft::Response>>>::new();

        let ticker = crossbeam::channel::tick(raft::TICK_INTERVAL);
        loop {
            crossbeam::select! {
                // Periodically tick the node. tick() and step() consume the
                // node and return the resulting node, hence the reassignment.
                recv(ticker) -> _ => node = node.tick().expect("tick failed"),

                // Step messages from peers into the node.
                recv(peers_rx) -> result => {
                    let msg = result.expect("peers_rx disconnected");
                    node = node.step(msg).expect("step failed");
                },

                // Send outbound messages from the node to the appropriate peer.
                // If we receive a client response addressed to the local node,
                // forward it to the waiting client via the response channel.
                recv(node_rx) -> result => {
                    let msg = result.expect("node_rx disconnected");
                    if msg.to == node.id()
                        && let raft::Message::ClientResponse{ id, response } = msg.message
                    {
                        // Responses for request IDs we aren't tracking are
                        // silently discarded.
                        if let Some(response_tx) = response_txs.remove(&id) {
                            response_tx.send(response).expect("response_tx disconnected");
                        }
                        continue
                    }
                    let peer_tx = peers_tx.get_mut(&msg.to).expect("unknown peer");
                    // Never block the router on a slow peer: use try_send and
                    // drop the message if the peer's channel is full.
                    match peer_tx.try_send(msg) {
                        Ok(()) => {},
                        Err(crossbeam::channel::TrySendError::Full(_)) => {
                            error!("Raft peer channel full, dropping message");
                        },
                        Err(crossbeam::channel::TrySendError::Disconnected(_)) => {
                            panic!("peer_tx disconnected");
                        },
                    };
                }

                // Track inbound client requests and step them into the node.
                // Each request is assigned a random ID, which is used to match
                // up the node's ClientResponse with the response channel.
                recv(request_rx) -> result => {
                    let (request, response_tx) = result.expect("request_rx disconnected");
                    let id = Uuid::new_v4();
                    let msg = raft::Envelope{
                        from: node.id(),
                        to: node.id(),
                        term: node.term(),
                        message: raft::Message::ClientRequest{id, request},
                    };
                    node = node.step(msg).expect("step failed");
                    // Any response emitted during the step is only picked up
                    // from node_rx in a later loop iteration, so registering
                    // the response channel after stepping is fine.
                    response_txs.insert(id, response_tx);
                }
            }
        }
    }

    /// Accepts new SQL client connections and spawns session threads for them.
    fn sql_accept(id: raft::NodeID, listener: TcpListener, sql_engine: sql::engine::Raft) {
        std::thread::scope(|s| {
            loop {
                let (socket, peer) = match listener.accept() {
                    Ok((socket, peer)) => (socket, peer),
                    Err(err) => {
                        error!("Client accept failed: {err}");
                        continue;
                    }
                };
                let session = sql_engine.session();
                s.spawn(move || {
                    debug!("Client {peer} connected");
                    match Self::sql_session(id, socket, session) {
                        Ok(()) => debug!("Client {peer} disconnected"),
                        Err(err) => error!("Client {peer} error: {err}"),
                    }
                });
            }
        })
    }

    /// Serves a single SQL client connection: decodes requests from the
    /// socket, runs them through the session, and writes back one response per
    /// request.
    ///
    /// Request failures are returned to the client as the response. Only
    /// socket and encoding failures end the session with an error. Returns
    /// `Ok` once the decoder reports that there are no more requests.
    fn sql_session(
        id: raft::NodeID,
        socket: TcpStream,
        mut session: sql::execution::Session<sql::engine::Raft>,
    ) -> Result<()> {
        // Use independently buffered read and write halves of the socket.
        let mut reader = BufReader::new(socket.try_clone()?);
        let mut writer = BufWriter::new(socket);

        loop {
            let Some(request) = Request::maybe_decode_from(&mut reader)? else {
                return Ok(());
            };
            debug!("Received request {request:?}");

            // Dispatch the request. Schema lookups use a read-only transaction.
            let response = match request {
                Request::Status => session.status().map(|s| {
                    Response::Status(Status { server: id, raft: s.raft, mvcc: s.mvcc })
                }),
                Request::ListTables => session
                    .with_txn(true, |txn| {
                        Ok(txn.list_tables()?.into_iter().map(|t| t.name).collect())
                    })
                    .map(Response::ListTables),
                Request::GetTable(table) => {
                    session.with_txn(true, |txn| txn.must_get_table(&table)).map(Response::GetTable)
                }
                Request::Execute(query) => session.execute(&query).map(Response::Execute),
            };

            // Send the result, flushing so the client sees it immediately.
            debug!("Returning response {response:?}");
            response.encode_into(&mut writer)?;
            writer.flush()?;
        }
    }
}

/// A SQL client request. Sent by clients over the SQL TCP connection, and
/// answered with a `Response` (or an error).
#[derive(Debug, Serialize, Deserialize)]
pub enum Request {
    /// Executes a SQL statement.
    Execute(String),
    /// Fetches the given table schema. Errors if the table does not exist.
    GetTable(String),
    /// Lists all tables, by name.
    ListTables,
    /// Returns server status.
    Status,
}

impl encoding::Value for Request {}

/// A SQL server response. The server returns the response matching the
/// request's variant.
#[derive(Debug, Serialize, Deserialize)]
pub enum Response {
    /// The result of a `Request::Execute` statement.
    Execute(StatementResult),
    /// An optional row.
    // NOTE(review): not produced by the request handling in this file --
    // confirm whether this variant is still in use.
    Row(Option<Row>),
    /// The table schema for a `Request::GetTable`.
    GetTable(Table),
    /// The table names for a `Request::ListTables`.
    ListTables(Vec<String>),
    /// The server status for a `Request::Status`.
    Status(Status),
}

impl encoding::Value for Response {}

/// SQL server status, returned for `Request::Status`.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct Status {
    /// The ID of the Raft node on the server that handled the request.
    pub server: raft::NodeID,
    /// The Raft status, as reported by the SQL session.
    pub raft: raft::Status,
    /// The MVCC storage status, as reported by the SQL session.
    pub mvcc: storage::mvcc::Status,
}


================================================
FILE: src/sql/engine/engine.rs
================================================
use std::collections::{BTreeMap, BTreeSet};

use crate::errinput;
use crate::error::Result;
use crate::sql::execution::Session;
use crate::sql::types::{Expression, Row, Rows, Table, Value};
use crate::storage::mvcc;

/// A SQL engine. This provides low-level CRUD (create, read, update, delete)
/// operations for table rows, a schema catalog for accessing and modifying
/// table schemas, and interactive SQL sessions that execute client SQL
/// statements. All engine access is transactional with snapshot isolation.
///
/// The lifetime `'a` is that of the engine borrow used to begin transactions
/// and create sessions.
pub trait Engine<'a>: Sized {
    /// The engine's transaction type. This provides both row-level CRUD operations and
    /// transactional access to the schema catalog.
    type Transaction: Transaction + 'a;

    /// Begins a read-write transaction.
    fn begin(&'a self) -> Result<Self::Transaction>;
    /// Begins a read-only transaction.
    fn begin_read_only(&'a self) -> Result<Self::Transaction>;
    /// Begins a read-only transaction as of a historical version.
    fn begin_as_of(&'a self, version: mvcc::Version) -> Result<Self::Transaction>;

    /// Creates a client session for executing SQL statements. The session
    /// borrows the engine.
    fn session(&'a self) -> Session<'a, Self> {
        Session::new(self)
    }
}

/// A SQL transaction. Executes transactional CRUD operations on table rows.
/// Provides snapshot isolation (see `storage::mvcc` module for details).
///
/// All methods operate on row batches rather than single rows to amortize the
/// cost. With the Raft engine, each call results in a Raft roundtrip, and we'd
/// rather not have to do that for every single row that's modified.
///
/// A transaction is also a `Catalog`, so schema changes are transactional too.
pub trait Transaction: Catalog {
    /// The transaction's internal MVCC state.
    fn state(&self) -> &mvcc::TransactionState;

    /// Commits the transaction. Consumes it.
    fn commit(self) -> Result<()>;
    /// Rolls back the transaction. Consumes it.
    fn rollback(self) -> Result<()>;

    /// Deletes table rows by primary key, if they exist.
    fn delete(&self, table: &str, ids: &[Value]) -> Result<()>;
    /// Fetches table rows by primary key, if they exist.
    fn get(&self, table: &str, ids: &[Value]) -> Result<Vec<Row>>;
    /// Inserts new table rows.
    fn insert(&self, table: &str, rows: Vec<Row>) -> Result<()>;
    /// Looks up a set of primary keys by index values. BTreeSet for testing.
    fn lookup_index(&self, table: &str, column: &str, values: &[Value]) -> Result<BTreeSet<Value>>;
    /// Scans a table's rows, optionally applying the given filter.
    fn scan(&self, table: &str, filter: Option<Expression>) -> Result<Rows>;
    /// Updates table rows by primary key. The map is keyed by the current
    /// primary key of each row to update. BTreeMap for testing.
    fn update(&self, table: &str, rows: BTreeMap<Value, Row>) -> Result<()>;
}

/// The catalog stores table schema information. It must be implemented for
/// Transaction, and is thus fully transactional. For simplicity, it only
/// supports creating and dropping tables -- there are no ALTER TABLE schema
/// changes, nor CREATE INDEX.
pub trait Catalog {
    /// Creates a new table. Errors if it already exists.
    fn create_table(&self, table: Table) -> Result<()>;
    /// Drops a table. Errors if it does not exist, unless if_exists is true.
    /// Returns true if the table existed and was deleted.
    fn drop_table(&self, table: &str, if_exists: bool) -> Result<bool>;
    /// Fetches a table schema, or None if it doesn't exist.
    fn get_table(&self, table: &str) -> Result<Option<Table>>;
    /// Returns a list of all table schemas.
    fn list_tables(&self) -> Result<Vec<Table>>;

    /// Fetches a table schema, or errors if it does not exist. This is a
    /// convenience wrapper around `get_table` that turns a missing table into
    /// an input error.
    fn must_get_table(&self, table: &str) -> Result<Table> {
        self.get_table(table)?.ok_or_else(|| errinput!("table {table} does not exist"))
    }
}


================================================
FILE: src/sql/engine/local.rs
================================================
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet};
use std::slice;

use itertools::Itertools as _;
use serde::{Deserialize, Serialize};

use super::Catalog;
use crate::encoding::{self, Key as _, Value as _};
use crate::errinput;
use crate::error::Result;
use crate::sql::types::{Expression, Row, Rows, Table, Value};
use crate::storage::{self, mvcc};

/// SQL engine keys, using the Keycode order-preserving encoding. For
/// simplicity, table and column names are used directly as identifiers, instead
/// of e.g. numeric IDs. It is not possible to change table/column names, so
/// this is fine, if somewhat inefficient.
///
/// Uses Cow to allow encoding borrowed values but decoding owned values.
///
/// The variant order must be kept in sync with `KeyPrefix`, since the variant
/// index is part of the encoded key.
#[derive(Debug, Deserialize, Serialize)]
pub enum Key<'a> {
    /// A table schema, keyed by table name. The value is a `sql::types::Table`.
    Table(Cow<'a, str>),
    /// A column index entry, keyed by table name, column name, and index value.
    /// The value is a `BTreeSet` of `sql::types::Value` primary key values.
    Index(Cow<'a, str>, Cow<'a, str>, Cow<'a, Value>),
    /// A table row, keyed by table name and primary key value. The value is a
    /// `sql::types::Row`.
    Row(Cow<'a, str>, Cow<'a, Value>),
}

impl<'a> encoding::Key<'a> for Key<'a> {}

/// Key prefixes, allowing prefix scans of specific parts of the keyspace. These
/// must match the keys -- in particular, the enum variant indexes must match,
/// since it's part of the encoded key. Each prefix carries the leading fields
/// of the corresponding `Key` variant, and omits the trailing ones.
#[derive(Deserialize, Serialize)]
enum KeyPrefix<'a> {
    /// All table schemas.
    Table,
    /// All column index entries, keyed by table and column name.
    Index(Cow<'a, str>, Cow<'a, str>),
    /// All table rows, keyed by table name.
    Row(Cow<'a, str>),
}

impl<'a> encoding::Key<'a> for KeyPrefix<'a> {}

/// A SQL engine using local storage. This provides the main SQL storage logic.
/// The Raft SQL engine dispatches to this for node-local SQL storage, executing
/// the same writes across each nodes' instance of `Local`.
///
/// All data is stored via MVCC transactions on top of the storage engine `E`.
pub struct Local<E: storage::Engine + 'static> {
    /// The local MVCC storage engine.
    pub mvcc: mvcc::MVCC<E>,
}

impl<E: storage::Engine> Local<E> {
    /// Builds a local SQL engine on top of the given storage engine, which is
    /// wrapped in an MVCC layer.
    pub fn new(engine: E) -> Self {
        let store = mvcc::MVCC::new(engine);
        Self { mvcc: store }
    }

    /// Resumes a transaction from the given state. The state is normally held
    /// inside `mvcc::Transaction`, but the Raft-based engine can't keep an
    /// MVCC transaction alive across requests, since they may be executed on
    /// different leader nodes. It keeps the state client-side in the session
    /// instead, and resumes the transaction for each request.
    pub fn resume(&self, state: mvcc::TransactionState) -> Result<Transaction<E>> {
        let txn = self.mvcc.resume(state)?;
        Ok(Transaction::new(txn))
    }

    /// Reads an unversioned key from the MVCC store, returning None if it
    /// doesn't exist.
    pub fn get_unversioned(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
        let store = &self.mvcc;
        store.get_unversioned(key)
    }

    /// Writes an unversioned key to the MVCC store.
    pub fn set_unversioned(&self, key: &[u8], value: Vec<u8>) -> Result<()> {
        let store = &self.mvcc;
        store.set_unversioned(key, value)
    }
}

impl<E: storage::Engine> super::Engine<'_> for Local<E> {
    type Transaction = Transaction<E>;

    fn begin(&self) -> Result<Self::Transaction> {
        Ok(Self::Transaction::new(self.mvcc.begin()?))
    }

    fn begin_read_only(&self) -> Result<Self::Transaction> {
        Ok(Self::Transaction::new(self.mvcc.begin_read_only()?))
    }

    fn begin_as_of(&self, version: mvcc::Version) -> Result<Self::Transaction> {
        Ok(Self::Transaction::new(self.mvcc.begin_as_of(version)?))
    }
}

/// A SQL transaction, wrapping an MVCC transaction. Table schemas, rows, and
/// secondary index entries are all stored as keys in the MVCC transaction.
pub struct Transaction<E: storage::Engine + 'static> {
    /// The underlying MVCC transaction that all reads and writes go through.
    txn: mvcc::Transaction<E>,
}

impl<E: storage::Engine> Transaction<E> {
    /// Wraps the given MVCC transaction in a SQL transaction.
    fn new(txn: mvcc::Transaction<E>) -> Self {
        Self { txn }
    }

    /// Returns the state of the underlying MVCC transaction.
    pub fn state(&self) -> &mvcc::TransactionState {
        self.txn.state()
    }

    /// Reads the secondary index entry for a column value, i.e. the primary
    /// keys of all rows with that value. A missing entry yields an empty set.
    fn get_index(&self, table: &str, column: &str, value: &Value) -> Result<BTreeSet<Value>> {
        debug_assert!(self.has_index(table, column)?, "no index on {table}.{column}");
        let key = Key::Index(table.into(), column.into(), value.into()).encode();
        match self.txn.get(&key)? {
            Some(bytes) => BTreeSet::decode(&bytes),
            None => Ok(BTreeSet::new()),
        }
    }

    /// Reads a single row by primary key, returning None if it doesn't exist.
    fn get_row(&self, table: &str, id: &Value) -> Result<Option<Row>> {
        let key = Key::Row(table.into(), id.into()).encode();
        match self.txn.get(&key)? {
            Some(bytes) => Row::decode(&bytes).map(Some),
            None => Ok(None),
        }
    }

    /// Reports whether the given column has a secondary index. Unknown
    /// columns have none. Errors if the table doesn't exist.
    fn has_index(&self, table: &str, column: &str) -> Result<bool> {
        let schema = self.must_get_table(table)?;
        Ok(schema.columns.iter().find(|c| c.name == column).is_some_and(|c| c.index))
    }

    /// Writes the secondary index entry for a column value, replacing any
    /// existing entry. An empty set of primary keys removes the entry.
    fn set_index(
        &self,
        table: &str,
        column: &str,
        value: &Value,
        ids: BTreeSet<Value>,
    ) -> Result<()> {
        debug_assert!(self.has_index(table, column)?, "no index on {table}.{column}");
        let key = Key::Index(table.into(), column.into(), value.into()).encode();
        if ids.is_empty() {
            self.txn.delete(&key)?;
            return Ok(());
        }
        self.txn.set(&key, ids.encode())
    }

    /// Finds all tables with foreign keys to the given table, returning each
    /// one along with the indexes of its referencing columns. The table itself
    /// is included if it references itself.
    fn table_references(&self, table: &str) -> Result<Vec<(Table, Vec<usize>)>> {
        let mut referencing = Vec::new();
        for candidate in self.list_tables()? {
            let columns = candidate
                .columns
                .iter()
                .enumerate()
                .filter_map(|(i, c)| (c.references.as_deref() == Some(table)).then_some(i))
                .collect_vec();
            // Skip tables that don't reference the given table at all.
            if !columns.is_empty() {
                referencing.push((candidate, columns));
            }
        }
        Ok(referencing)
    }
}

impl<E: storage::Engine> super::Transaction for Transaction<E> {
    /// Returns the state of the underlying MVCC transaction.
    fn state(&self) -> &mvcc::TransactionState {
        self.txn.state()
    }

    /// Commits the underlying MVCC transaction, consuming the SQL transaction.
    fn commit(self) -> Result<()> {
        self.txn.commit()
    }

    /// Rolls back the underlying MVCC transaction, consuming the SQL
    /// transaction.
    fn rollback(self) -> Result<()> {
        self.txn.rollback()
    }

    /// Deletes the rows with the given primary keys from a table, along with
    /// their secondary index entries. Primary keys without a row are ignored.
    ///
    /// Errors if the table doesn't exist, or if any of the rows are referenced
    /// by a foreign key from a row that isn't itself being deleted.
    fn delete(&self, table: &str, ids: &[Value]) -> Result<()> {
        let table = self.must_get_table(table)?;

        // Enforce referential integrity: look for rows referencing the deleted
        // rows, across all tables with foreign keys to this table.
        for (source, refs) in self.table_references(&table.name)? {
            let self_reference = source.name == table.name;
            for i in refs {
                let mut source_ids = if i != source.primary_key {
                    // The common case: the reference is from a regular column.
                    // All foreign keys have a secondary index, so use it.
                    self.lookup_index(&source.name, &source.columns[i].name, ids)?
                } else {
                    // The reference is from the primary key column itself, so
                    // fetch the rows by primary key.
                    self.get(&source.name, ids)?
                        .into_iter()
                        .map(|row| row.into_iter().nth(i).expect("short row"))
                        .collect()
                };
                // References between the deleted rows don't count, nor does a
                // row referencing itself.
                if self_reference {
                    for id in ids {
                        source_ids.remove(id);
                    }
                }
                // Any remaining reference would be left dangling.
                let Some(source_id) = source_ids.first() else {
                    continue;
                };
                let table = source.name;
                let column = &source.columns[source.primary_key].name;
                return errinput!("row referenced by {table}.{column}={source_id}");
            }
        }

        // Remove each row, and its entries in any secondary indexes.
        let indexed = table.columns.iter().enumerate().filter(|(_, c)| c.index).collect_vec();
        for id in ids {
            let row_key = Key::Row((&table.name).into(), id.into()).encode();
            if !indexed.is_empty() {
                // The index entries are keyed by the row's current values, so
                // the row has to be read before it's removed.
                if let Some(row) = self.get_row(&table.name, id)? {
                    for &(i, column) in &indexed {
                        let mut entry = self.get_index(&table.name, &column.name, &row[i])?;
                        entry.remove(id);
                        self.set_index(&table.name, &column.name, &row[i], entry)?;
                    }
                }
            }
            self.txn.delete(&row_key)?;
        }
        Ok(())
    }

    /// Fetches the rows with the given primary keys, in the order of the
    /// keys. Keys without a row are skipped, so the result may be shorter than
    /// `ids`. Stops at the first read error.
    fn get(&self, table: &str, ids: &[Value]) -> Result<Vec<Row>> {
        let mut rows = Vec::new();
        for id in ids {
            if let Some(row) = self.get_row(table, id)? {
                rows.push(row);
            }
        }
        Ok(rows)
    }

    /// Inserts the given rows into a table, validating each row against the
    /// schema and adding it to the secondary indexes of any indexed columns.
    ///
    /// Errors if the table doesn't exist or a row fails validation. Rows are
    /// processed in order, so earlier rows have already been written when a
    /// later row fails.
    fn insert(&self, table: &str, rows: Vec<Row>) -> Result<()> {
        let table = self.must_get_table(table)?;
        for row in rows {
            // Validate and write the row itself.
            table.validate_row(&row, false, self)?;
            let id = &row[table.primary_key];
            let row_key = Key::Row((&table.name).into(), id.into()).encode();
            self.txn.set(&row_key, row.encode())?;

            // Add the row's primary key to the index entry of each indexed
            // column value.
            for (i, column) in table.columns.iter().enumerate() {
                if !column.index {
                    continue;
                }
                let mut entry = self.get_index(&table.name, &column.name, &row[i])?;
                entry.insert(id.clone());
                self.set_index(&table.name, &column.name, &row[i], entry)?;
            }
        }
        Ok(())
    }

    /// Looks up the primary keys of all rows whose indexed column has any of
    /// the given values, as the union of the index entries for each value.
    fn lookup_index(&self, table: &str, column: &str, values: &[Value]) -> Result<BTreeSet<Value>> {
        debug_assert!(self.has_index(table, column)?, "no index on {table}.{column}");
        let mut ids = BTreeSet::new();
        for value in values {
            ids.extend(self.get_index(table, column, value)?);
        }
        Ok(ids)
    }

    /// Scans all rows of a table, as a lazy iterator over decoded rows.
    ///
    /// If a filter is given, only rows for which it evaluates to true are
    /// emitted. Rows where it evaluates to false or NULL are skipped, and any
    /// other filter result is emitted as an error. Scan and decode errors are
    /// emitted as items too, rather than failing the call.
    fn scan(&self, table: &str, filter: Option<Expression>) -> Result<Rows> {
        // TODO: this could be simpler if process_results() implemented Clone.
        let prefix = KeyPrefix::Row(table.into()).encode();
        let rows = self
            .txn
            .scan_prefix(&prefix)
            .map(|item| item.and_then(|(_, bytes)| Row::decode(&bytes)));
        let rows: Rows = match filter {
            None => Box::new(rows),
            Some(filter) => Box::new(rows.filter_map(move |item| {
                // Some(row) keeps the row, None drops it.
                let kept = item.and_then(|row| match filter.evaluate(Some(&row))? {
                    Value::Boolean(true) => Ok(Some(row)),
                    Value::Boolean(false) | Value::Null => Ok(None),
                    value => errinput!("filter returned {value}, expected boolean"),
                });
                kept.transpose()
            })),
        };
        Ok(rows)
    }

    fn update(&self, table: &str, rows: BTreeMap<Value, Row>) -> Result<()> {
        let table = self.must_get_table(table)?;
        for (id, row) in rows {
            // If the primary key changes, we simply do a delete and insert.
            // This simplifies constraint validation.
            if id != row[table.primary_key] {
                self.delete(&table.name, &[id])?;
                self.insert(&table.name, vec![row])?;
                continue;
            }

            // Validate the row, but don't write it yet since we may need to
            // read the existing value to update secondary indexes.
            table.validate_row(&row, true, self)?;

            // Update indexes, knowing that the primary key has not changed.
            let indexes = table.columns.iter().enumerate().filter(|(_, c)| c.index).collect_vec();
            if !indexes.is_empty() {
                let old = self.get(&table.name, slice::from_ref(&id))?.remove(0);
                for (i, column) in indexes {
                    // If the value didn't change, we don't have to do anything.
                    if old[i] == row[i] {
                        continue;
                    }

                    // Remove the old value from the index entry.
                    let mut ids = self.get_index(&table.name, &column.name, &old[i])?;
                    ids.remove(&id);
                    self.set_index(&table.name, &column.name, &old[i], ids)?;

                    // Insert the new value into the index entry.
                    let mut ids = self.get_index(&table.name, &column.name, &row[i])?;
                    ids.insert(id.clone());
                    self.set_index(&table.name, &column.name, &row[i], ids)?;
                }
            }

            // Update the row.
            self.txn.set(&Key::Row((&table.name).into(), (&id).into()).encode(), row.encode())?;
        }
        Ok(())
    }
}

impl<E: storage::Engine> Catalog for Transaction<E> {
    /// Validates and stores a new table schema. Errors if a table with the
    /// same name already exists, or if the schema is invalid.
    fn create_table(&self, table: Table) -> Result<()> {
        let exists = self.get_table(&table.name)?.is_some();
        if exists {
            return errinput!("table {} already exists", table.name);
        }
        table.validate(self)?;
        let key = Key::Table((&table.name).into()).encode();
        self.txn.set(&key, table.encode())
    }

    /// Drops a table, along with all of its rows and secondary index entries.
    /// Returns true if the table existed and was dropped.
    ///
    /// Errors if the table doesn't exist (unless `if_exists` is set, in which
    /// case it returns false), or if another table has foreign keys to it.
    fn drop_table(&self, table: &str, if_exists: bool) -> Result<bool> {
        let schema = match self.get_table(table)? {
            Some(schema) => schema,
            None if if_exists => return Ok(false),
            None => return errinput!("table {table} does not exist"),
        };

        // Refuse to drop tables that other tables have foreign keys to.
        // References from the table itself are fine, they'll go away with it.
        let references = self.table_references(&schema.name)?;
        if let Some((source, refs)) = references.iter().find(|(t, _)| t.name != schema.name) {
            return errinput!(
                "table {} is referenced from {}.{}",
                schema.name,
                source.name,
                source.columns[refs[0]].name
            );
        }

        // Remove the schema entry.
        self.txn.delete(&Key::Table((&schema.name).into()).encode())?;

        // Remove all rows.
        let row_prefix = KeyPrefix::Row((&schema.name).into()).encode();
        for entry in self.txn.scan_prefix(&row_prefix) {
            let (key, _) = entry?;
            self.txn.delete(&key)?;
        }

        // Remove the index entries of every indexed column.
        for column in schema.columns.iter().filter(|c| c.index) {
            let index_prefix =
                KeyPrefix::Index((&schema.name).into(), (&column.name).into()).encode();
            for entry in self.txn.scan_prefix(&index_prefix) {
                let (key, _) = entry?;
                self.txn.delete(&key)?;
            }
        }
        Ok(true)
    }

    /// Reads and decodes a table schema by name, returning None if the table
    /// doesn't exist.
    fn get_table(&self, table: &str) -> Result<Option<Table>> {
        let key = Key::Table(table.into()).encode();
        match self.txn.get(&key)? {
            Some(bytes) => Table::decode(&bytes).map(Some),
            None => Ok(None),
        }
    }

    /// Reads and decodes all table schemas, in key order. Stops at the first
    /// scan or decode error.
    fn list_tables(&self) -> Result<Vec<Table>> {
        let prefix = KeyPrefix::Table.encode();
        let mut tables = Vec::new();
        for entry in self.txn.scan_prefix(&prefix) {
            let (_, bytes) = entry?;
            tables.push(Table::decode(&bytes)?);
        }
        Ok(tables)
    }
}


================================================
FILE: src/sql/engine/mod.rs
================================================
//! The SQL engine provides SQL data storage and access, as well as session and
//! transaction management. The `Local` engine provides node-local on-disk
//! storage, while the `Raft` engine submits commands through Raft consensus
//! before dispatching to the `Local` engine on each node.

mod engine;
mod local;
mod raft;

pub use engine::{Catalog, Engine, Transaction};
pub use local::{Key, Local};
pub use raft::{Raft, Status, Write};


================================================
FILE: src/sql/engine/raft.rs
================================================
use std::borrow::Cow;
use std::collections::{BTreeMap, BTreeSet};

use crossbeam::channel::Sender;
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};

use super::{Catalog, Engine as _, Transaction as _};
use crate::encoding::{self, Value as _, bincode};
use crate::errdata;
use crate::error::Result;
use crate::raft;
use crate::sql::types::{Expression, Row, Rows, Table, Value};
use crate::storage::{self, mvcc};

/// A read command, submitted via Raft and executed on the leader. Each command
/// corresponds to a SQL engine method and parameters. Uses Cows to allow
/// borrowed encoding and owned decoding.
///
/// Commands are handled by the state machine's `read` method, which encodes
/// the corresponding engine method's return value as the response.
#[derive(Debug, Serialize, Deserialize)]
pub enum Read<'a> {
    /// Begins a read-only transaction, optionally as of a given MVCC version.
    /// Responds with the new transaction's state.
    BeginReadOnly {
        as_of: Option<mvcc::Version>,
    },
    /// Fetches the MVCC status of the local engine.
    Status,

    /// Fetches rows from a table by primary key.
    Get {
        txn: Cow<'a, mvcc::TransactionState>,
        table: Cow<'a, str>,
        ids: Cow<'a, [Value]>,
    },
    /// Looks up primary keys by the given values of a secondary index column.
    LookupIndex {
        txn: Cow<'a, mvcc::TransactionState>,
        table: Cow<'a, str>,
        column: Cow<'a, str>,
        values: Cow<'a, [Value]>,
    },
    /// Scans a table, optionally filtering rows. The state machine buffers
    /// the entire result before responding.
    Scan {
        txn: Cow<'a, mvcc::TransactionState>,
        table: Cow<'a, str>,
        filter: Option<Expression>,
    },

    /// Fetches a table schema, if the table exists.
    GetTable {
        txn: Cow<'a, mvcc::TransactionState>,
        table: Cow<'a, str>,
    },
    /// Lists all table schemas.
    ListTables {
        txn: Cow<'a, mvcc::TransactionState>,
    },
}

// The empty impl relies on the trait's provided methods for `encode` and
// `decode`, which are used to send and receive commands.
impl encoding::Value for Read<'_> {}

/// A write command, submitted via Raft and executed on all nodes. Each command
/// corresponds to a SQL engine method and parameters. Uses Cows to allow
/// borrowed encoding and owned decoding.
///
/// Commands are handled by the state machine's `write` method, which encodes
/// the corresponding engine method's return value as the response.
#[derive(Debug, Serialize, Deserialize)]
pub enum Write<'a> {
    /// Begins a read-write transaction. Responds with the transaction state.
    Begin,
    /// Commits the given transaction.
    Commit(Cow<'a, mvcc::TransactionState>),
    /// Rolls back the given transaction.
    Rollback(Cow<'a, mvcc::TransactionState>),

    /// Deletes rows from a table by primary key.
    Delete { txn: Cow<'a, mvcc::TransactionState>, table: Cow<'a, str>, ids: Cow<'a, [Value]> },
    /// Inserts rows into a table.
    Insert { txn: Cow<'a, mvcc::TransactionState>, table: Cow<'a, str>, rows: Vec<Row> },
    /// Updates rows in a table. The executor keys the map by each row's
    /// primary key value.
    Update { txn: Cow<'a, mvcc::TransactionState>, table: Cow<'a, str>, rows: BTreeMap<Value, Row> },

    /// Creates a table with the given schema.
    CreateTable { txn: Cow<'a, mvcc::TransactionState>, schema: Table },
    /// Drops a table. Responds with the boolean returned by the local
    /// engine's `drop_table`.
    DropTable { txn: Cow<'a, mvcc::TransactionState>, table: Cow<'a, str>, if_exists: bool },
}

// The empty impl relies on the trait's provided methods for `encode` and
// `decode`, which are used to send and receive commands.
impl encoding::Value for Write<'_> {}

/// Raft SQL engine status. Combines the status of the Raft node with the
/// status of the MVCC storage below it.
#[derive(Serialize, Deserialize)]
pub struct Status {
    /// The Raft status, as returned for a `raft::Request::Status` request.
    pub raft: raft::Status,
    /// The MVCC status, as returned for a `Read::Status` command.
    pub mvcc: mvcc::Status,
}

/// A Raft-based SQL engine. This dispatches to the `Local` engine for local
/// storage and processing on each node, but sends read and write commands
/// through Raft for distributed consensus.
///
/// The `Raft` engine itself is simply a Raft client which sends `raft::Request`
/// to the local Raft node for processing. These requests are applied to the
/// Raft SQL engine's `State` state machine running below Raft on each node,
/// which executes the commands on a `Local` SQL engine using a
/// `storage::Engine` for local storage.
///
/// For more details on how SQL statements flow through the engine, see the
/// `sql` module documentation.
pub struct Raft {
    /// Sends requests to the local Raft node, along with a response channel.
    /// Each request gets its own response channel, on which the result (or
    /// error) of that request is received.
    tx: Sender<(raft::Request, Sender<Result<raft::Response>>)>,
}

impl Raft {
    /// The unversioned key used to store the applied index. Just uses a string
    /// for simplicity.
    pub const APPLIED_INDEX_KEY: &'static [u8] = b"applied_index";

    /// Creates a new Raft-based SQL engine, with a channel to send requests to
    /// the local Raft node.
    pub fn new(tx: Sender<(raft::Request, Sender<Result<raft::Response>>)>) -> Self {
        Self { tx }
    }

    /// Creates the Raft-managed state machine for the Raft engine. Receives
    /// commands from the Raft engine and executes them on a `Local` engine.
    pub fn new_state<E: storage::Engine>(engine: E) -> Result<State<E>> {
        State::new(engine)
    }

    /// Executes a request against the Raft cluster, waiting for the response.
    fn request(&self, request: raft::Request) -> Result<raft::Response> {
        let (response_tx, response_rx) = crossbeam::channel::bounded(1);
        self.tx.send((request, response_tx))?;
        response_rx.recv()?
    }

    /// Writes through Raft, deserializing the response into the return type.
    fn write<V: DeserializeOwned>(&self, write: Write) -> Result<V> {
        match self.request(raft::Request::Write(write.encode()))? {
            raft::Response::Write(response) => bincode::deserialize(&response),
            response => errdata!("unexpected Raft write response {response:?}"),
        }
    }

    /// Reads from Raft, deserializing the response into the return type.
    fn read<V: DeserializeOwned>(&self, read: Read) -> Result<V> {
        match self.request(raft::Request::Read(read.encode()))? {
            raft::Response::Read(response) => bincode::deserialize(&response),
            response => errdata!("unexpected Raft read response {response:?}"),
        }
    }

    /// Raft SQL engine status.
    pub fn status(&self) -> Result<Status> {
        let raft = match self.request(raft::Request::Status)? {
            raft::Response::Status(status) => status,
            response => return errdata!("unexpected Raft status response {response:?}"),
        };
        let mvcc = self.read(Read::Status)?;
        Ok(Status { raft, mvcc })
    }
}

impl<'a> super::Engine<'a> for Raft {
    type Transaction = Transaction<'a>;

    /// Begins a read-write transaction.
    fn begin(&'a self) -> Result<Self::Transaction> {
        let read_only = false;
        Transaction::begin(self, read_only, None)
    }

    /// Begins a read-only transaction at the current version.
    fn begin_read_only(&'a self) -> Result<Self::Transaction> {
        let read_only = true;
        Transaction::begin(self, read_only, None)
    }

    /// Begins a read-only transaction as of the given MVCC version.
    fn begin_as_of(&'a self, version: mvcc::Version) -> Result<Self::Transaction> {
        let read_only = true;
        Transaction::begin(self, read_only, Some(version))
    }
}

/// A Raft SQL engine transaction.
///
/// This keeps track of the transaction state in memory. An `mvcc::Transaction`
/// normally manages this, but since `mvcc::Transaction` runs below Raft, it
/// can't maintain this state between individual requests (which could execute
/// on different leaders). Instead, it uses `mvcc::Transaction::resume` to
/// resume the transaction from the provided transaction state for each request.
pub struct Transaction<'a> {
    /// The Raft SQL engine client, used to communicate with Raft.
    raft: &'a Raft,
    /// The MVCC transaction state. This is returned by the state machine when
    /// the transaction begins, and is sent along with every subsequent read
    /// and write command of the transaction.
    state: mvcc::TransactionState,
}

impl<'a> Transaction<'a> {
    /// Begins a transaction in the requested mode and records the transaction
    /// state returned by the state machine.
    ///
    /// Panics if `as_of` is given for a transaction that isn't read-only.
    fn begin(raft: &'a Raft, read_only: bool, as_of: Option<mvcc::Version>) -> Result<Self> {
        assert!(as_of.is_none() || read_only, "can't use as_of without read_only");
        let state = match (read_only, as_of) {
            // Read-write transactions allocate a new MVCC version, which must
            // be replicated as a write.
            (false, None) => raft.write(Write::Begin)?,
            // Read-only transactions write nothing, they only fetch the
            // current transaction state. Submitting them as reads avoids a
            // replication roundtrip.
            (_, as_of) => raft.read(Read::BeginReadOnly { as_of })?,
        };
        Ok(Self { raft, state })
    }
}

impl super::Transaction for Transaction<'_> {
    fn state(&self) -> &mvcc::TransactionState {
        &self.state
    }

    fn commit(self) -> Result<()> {
        if self.state.read_only {
            return Ok(()); // noop
        }
        self.raft.write(Write::Commit(self.state.into()))
    }

    fn rollback(self) -> Result<()> {
        if self.state.read_only {
            return Ok(()); // noop
        }
        self.raft.write(Write::Rollback(self.state.into()))
    }

    fn delete(&self, table: &str, ids: &[Value]) -> Result<()> {
        self.raft.write(Write::Delete {
            txn: (&self.state).into(),
            table: table.into(),
            ids: ids.into(),
        })
    }

    fn get(&self, table: &str, ids: &[Value]) -> Result<Vec<Row>> {
        self.raft.read(Read::Get {
            txn: (&self.state).into(),
            table: table.into(),
            ids: ids.into(),
        })
    }

    fn insert(&self, table: &str, rows: Vec<Row>) -> Result<()> {
        self.raft.write(Write::Insert { txn: (&self.state).into(), table: table.into(), rows })
    }

    fn lookup_index(&self, table: &str, column: &str, values: &[Value]) -> Result<BTreeSet<Value>> {
        self.raft.read(Read::LookupIndex {
            txn: (&self.state).into(),
            table: table.into(),
            column: column.into(),
            values: values.into(),
        })
    }

    fn scan(&self, table: &str, filter: Option<Expression>) -> Result<Rows> {
        let scan: Vec<Row> = self.raft.read(Read::Scan {
            txn: (&self.state).into(),
            table: table.into(),
            filter,
        })?;
        Ok(Box::new(scan.into_iter().map(Ok)))
    }

    fn update(&self, table: &str, rows: BTreeMap<Value, Row>) -> Result<()> {
        self.raft.write(Write::Update { txn: (&self.state).into(), table: table.into(), rows })
    }
}

impl Catalog for Transaction<'_> {
    fn create_table(&self, schema: Table) -> Result<()> {
        self.raft.write(Write::CreateTable { txn: (&self.state).into(), schema })
    }

    fn drop_table(&self, table: &str, if_exists: bool) -> Result<bool> {
        self.raft.write(Write::DropTable {
            txn: (&self.state).into(),
            table: table.into(),
            if_exists,
        })
    }

    fn get_table(&self, table: &str) -> Result<Option<Table>> {
        self.raft.read(Read::GetTable { txn: (&self.state).into(), table: table.into() })
    }

    fn list_tables(&self) -> Result<Vec<Table>> {
        self.raft.read(Read::ListTables { txn: (&self.state).into() })
    }
}

/// The state machine for the Raft SQL engine. Receives commands via Raft and
/// dispatches to a `Local` SQL engine which does the actual work, using a
/// `storage::Engine` for storage.
///
/// For simplicity, we don't attempt to stream large requests or responses,
/// instead just delivering them as one large chunk. This means that e.g. a full
/// table scan will pull the entire table into memory, serialize it, and send it
/// across the network as one message, but that's fine for toyDB.
pub struct State<E: storage::Engine + 'static> {
    /// The local SQL engine, used for actual storage.
    local: super::Local<E>,
    /// The last applied index. This tells Raft which command to apply next.
    /// It is loaded from `Raft::APPLIED_INDEX_KEY` when the state machine is
    /// created (defaulting when the key is absent), and is written back to
    /// that key for every applied entry.
    applied_index: raft::Index,
}

impl<E: storage::Engine> State<E> {
    /// Creates a new Raft state maching using the given storage engine for
    /// local storage.
    pub fn new(engine: E) -> Result<Self> {
        let local = super::Local::new(engine);
        let applied_index = local
            .get_unversioned(Raft::APPLIED_INDEX_KEY)?
            .map(|b| bincode::deserialize(&b))
            .transpose()?
            .unwrap_or_default();
        Ok(State { local, applied_index })
    }

    /// Executes a write command. This is executed on all nodes, but the
    /// response is returned from the Raft leader.
    ///
    /// The response is encoded using Bincode. The caller will know what
    /// response type to expect for each command and deserialize into it.
    fn write(&self, command: Write) -> Result<Vec<u8>> {
        Ok(match command {
            Write::Begin => self.local.begin()?.state().encode(),
            Write::Commit(txn) => {
                bincode::serialize(&self.local.resume(txn.into_owned())?.commit()?)
            }
            Write::Rollback(txn) => {
                bincode::serialize(&self.local.resume(txn.into_owned())?.rollback()?)
            }

            Write::Delete { txn, table, ids } => {
                bincode::serialize(&self.local.resume(txn.into_owned())?.delete(&table, &ids)?)
            }
            Write::Insert { txn, table, rows } => {
                bincode::serialize(&self.local.resume(txn.into_owned())?.insert(&table, rows)?)
            }
            Write::Update { txn, table, rows } => {
                bincode::serialize(&self.local.resume(txn.into_owned())?.update(&table, rows)?)
            }

            Write::CreateTable { txn, schema } => {
                bincode::serialize(&self.local.resume(txn.into_owned())?.create_table(schema)?)
            }
            Write::DropTable { txn, table, if_exists } => bincode::serialize(
                &self.local.resume(txn.into_owned())?.drop_table(&table, if_exists)?,
            ),
        })
    }
}

impl<E: storage::Engine> raft::State for State<E> {
    /// Returns the index of the last applied entry.
    fn get_applied_index(&self) -> raft::Index {
        self.applied_index
    }

    /// Applies a Raft log entry to the state machine, returning the encoded
    /// command response (empty for noop entries).
    ///
    /// The entry must immediately follow the last applied index, otherwise
    /// this panics. It also panics on non-deterministic command failures.
    /// Other command errors are returned only after the applied index has
    /// been recorded, so the entry counts as applied.
    ///
    /// NOTE(review): a command that fails to decode returns early via `?`,
    /// before the applied index is updated -- confirm this is intended.
    fn apply(&mut self, entry: raft::Entry) -> Result<Vec<u8>> {
        assert_eq!(entry.index, self.applied_index + 1, "entry index not after applied index");

        let result = match &entry.command {
            Some(command) => match self.write(Write::decode(command)?) {
                // Panic on non-deterministic apply failures, to prevent node
                // state divergence. See `raft::State` docs for details.
                Err(e) if !e.is_deterministic() => panic!("non-deterministic apply failure: {e}"),
                result => result,
            },
            // Raft submits noop commands on leader changes. Ignore them, but
            // record the applied index below.
            None => Ok(Vec::new()),
        };

        // Persist the applied index. We don't have to flush, because it's ok to
        // lose a tail of the state machine writes (e.g. if the machine
        // crashes). Raft will replay the log from the last known applied index.
        self.applied_index = entry.index;
        self.local.set_unversioned(Raft::APPLIED_INDEX_KEY, bincode::serialize(&entry.index))?;
        result
    }

    /// Executes an encoded `Read` command against the local engine, returning
    /// the encoded response. The caller knows what response type to expect
    /// for each command and deserializes into it.
    fn read(&self, command: Vec<u8>) -> Result<Vec<u8>> {
        Ok(match Read::decode(&command)? {
            // Beginning a read-only transaction just returns its state, which
            // the client sends along with subsequent commands.
            Read::BeginReadOnly { as_of } => {
                let txn = match as_of {
                    Some(version) => self.local.begin_as_of(version)?,
                    None => self.local.begin_read_only()?,
                };
                txn.state().encode()
            }
            Read::Status => self.local.mvcc.status()?.encode(),

            // All other reads resume the transaction from the given state.
            Read::Get { txn, table, ids } => {
                self.local.resume(txn.into_owned())?.get(&table, &ids)?.encode()
            }
            Read::LookupIndex { txn, table, column, values } => self
                .local
                .resume(txn.into_owned())?
                .lookup_index(&table, &column, &values)?
                .encode(),
            Read::Scan { txn, table, filter } => {
                // For simplicity, buffer the entire scan. See `State` comment.
                self.local
                    .resume(txn.into_owned())?
                    .scan(&table, filter)?
                    .collect::<Result<Vec<Row>>>()?
                    .encode()
            }

            Read::GetTable { txn, table } => {
                self.local.resume(txn.into_owned())?.get_table(&table)?.encode()
            }
            Read::ListTables { txn } => {
                self.local.resume(txn.into_owned())?.list_tables()?.encode()
            }
        })
    }
}


================================================
FILE: src/sql/execution/aggregator.rs
================================================
use std::collections::BTreeMap;

use itertools::Itertools as _;

use crate::error::Result;
use crate::sql::planner::Aggregate;
use crate::sql::types::{Expression, Row, Rows, Value};

/// Computes bucketed aggregates for input rows. For example, this query would
/// compute COUNT and SUM aggregates bucketed by category and brand:
///
/// SELECT COUNT(*), SUM(price) FROM products GROUP BY category, brand
///
/// Rows are fed in via `add` or `add_rows`, and the result is emitted by
/// `into_rows` as one row per bucket.
pub struct Aggregator {
    /// GROUP BY expressions. Evaluated against each input row to determine
    /// the row's bucket.
    group_by: Vec<Expression>,
    /// Aggregates to compute.
    aggregates: Vec<Aggregate>,
    /// Accumulators indexed by group_by bucket. Each bucket has one
    /// accumulator per aggregate, in the same order as `aggregates`.
    buckets: BTreeMap<Vec<Value>, Vec<Accumulator>>,
}

impl Aggregator {
    /// Builds an aggregator with the given GROUP BY expressions and
    /// aggregates, and no buckets.
    pub fn new(group_by: Vec<Expression>, aggregates: Vec<Aggregate>) -> Self {
        let buckets = BTreeMap::new();
        Self { group_by, aggregates, buckets }
    }

    /// Feeds a single row into the aggregator.
    pub fn add(&mut self, row: &Row) -> Result<()> {
        // The evaluated GROUP BY expressions make up the row's bucket key.
        let mut bucket = Vec::with_capacity(self.group_by.len());
        for expr in &self.group_by {
            bucket.push(expr.evaluate(Some(row))?);
        }

        // Fetch the bucket's accumulators, initializing them for new buckets.
        let accumulators = self
            .buckets
            .entry(bucket)
            .or_insert_with(|| self.aggregates.iter().map(Accumulator::new).collect());

        // Evaluate each aggregate's expression and feed it to its accumulator.
        for (accumulator, aggregate) in accumulators.iter_mut().zip_eq(&self.aggregates) {
            accumulator.add(aggregate.expr().evaluate(Some(row))?)?;
        }
        Ok(())
    }

    /// Feeds all rows into the aggregator, stopping at the first error.
    pub fn add_rows(&mut self, rows: Rows) -> Result<()> {
        rows.into_iter().try_for_each(|row| self.add(&row?))
    }

    /// Consumes the aggregator, returning a row iterator over the result.
    /// Each row has the bucket's GROUP BY values followed by its aggregates.
    pub fn into_rows(self) -> Rows {
        let Self { group_by, aggregates, buckets } = self;

        // Without any input rows or GROUP BY expressions, emit a single row
        // of empty accumulators (e.g. SELECT COUNT(*) FROM t WHERE FALSE).
        if buckets.is_empty() && group_by.is_empty() {
            let row =
                aggregates.iter().map(|aggregate| Accumulator::new(aggregate).value()).try_collect();
            return Box::new(std::iter::once(row));
        }

        // Rows requires a cloneable iterator, which btree_map::IntoIter isn't,
        // so go via an intermediate vector.
        let buckets = buckets.into_iter().collect_vec();
        Box::new(buckets.into_iter().map(|(bucket, accumulators)| {
            let group_values = bucket.into_iter().map(Ok);
            let aggregate_values = accumulators.into_iter().map(Accumulator::value);
            group_values.chain(aggregate_values).collect()
        }))
    }
}

/// Accumulates aggregate values. Uses an enum rather than a trait since we need
/// to keep these in a vector (could use boxed trait objects too).
///
/// NULL input values are ignored by all accumulators.
#[derive(Clone)]
enum Accumulator {
    /// AVG: the number of accumulated values and their sum. Yields NULL if
    /// no values were accumulated.
    Average { count: i64, sum: Value },
    /// COUNT: the number of accumulated values.
    Count(i64),
    /// MAX: the largest value seen, or None if no values were accumulated.
    Max(Option<Value>),
    /// MIN: the smallest value seen, or None if no values were accumulated.
    Min(Option<Value>),
    /// SUM: the sum of the values, or None if no values were accumulated.
    Sum(Option<Value>),
}

impl Accumulator {
    /// Builds an empty accumulator for the given kind of aggregate.
    fn new(aggregate: &Aggregate) -> Self {
        match aggregate {
            Aggregate::Count(_) => Self::Count(0),
            Aggregate::Average(_) => Self::Average { count: 0, sum: Value::Integer(0) },
            Aggregate::Sum(_) => Self::Sum(None),
            Aggregate::Min(_) => Self::Min(None),
            Aggregate::Max(_) => Self::Max(None),
        }
    }

    /// Feeds a value into the accumulator. NULL values are ignored.
    fn add(&mut self, value: Value) -> Result<()> {
        if value == Value::Null {
            return Ok(());
        }
        match self {
            Self::Average { sum, count } => {
                // Compute the new sum first, such that an error leaves the
                // accumulator untouched.
                let total = sum.checked_add(&value)?;
                *sum = total;
                *count = *count + 1;
            }
            Self::Count(count) => *count += 1,
            Self::Max(max) => {
                if max.as_ref().is_none_or(|current| value > *current) {
                    *max = Some(value);
                }
            }
            Self::Min(min) => {
                if min.as_ref().is_none_or(|current| value < *current) {
                    *min = Some(value);
                }
            }
            Self::Sum(sum) => {
                // The first value is added to an integer zero.
                let total = match sum {
                    Some(current) => current.checked_add(&value)?,
                    None => Value::Integer(0).checked_add(&value)?,
                };
                *sum = Some(total);
            }
        }
        Ok(())
    }

    /// Consumes the accumulator and returns the aggregate value. Accumulators
    /// that saw no values yield NULL, except for counts.
    fn value(self) -> Result<Value> {
        Ok(match self {
            Self::Count(count) => count.into(),
            Self::Average { count: 0, .. } => Value::Null,
            Self::Average { count, sum } => sum.checked_div(&Value::Integer(count))?,
            Self::Max(value) | Self::Min(value) | Self::Sum(value) => {
                value.unwrap_or(Value::Null)
            }
        })
    }
}


================================================
FILE: src/sql/execution/executor.rs
================================================
use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap};

use itertools::{Itertools as _, izip};

use super::aggregator::Aggregator;
use super::join::{HashJoiner, NestedLoopJoiner};
use crate::errinput;
use crate::error::Result;
use crate::sql::engine::Transaction;
use crate::sql::planner::{Direction, Node, Plan};
use crate::sql::types::{Expression, Label, Row, Rows, Table, Value};

/// Executes statement plans.
///
/// The plan root specifies the action to take (e.g. SELECT, INSERT, UPDATE,
/// etc). It has a nested tree of child nodes that process rows.
///
/// Nodes are executed recursively, and return row iterators. Parent nodes
/// recursively pull input rows from their child nodes, process them, and pass
/// them on to their parent node.
///
/// Below is an example of an (unoptimized) query plan:
///
/// SELECT title, released, genres.name AS genre
/// FROM movies INNER JOIN genres ON movies.genre_id = genres.id
/// WHERE released >= 2000
/// ORDER BY released DESC
///
/// Select
/// └─ Order: movies.released desc
///    └─ Projection: movies.title, movies.released, genres.name as genre
///       └─ Filter: movies.released >= 2000
///          └─ NestedLoopJoin: inner on movies.genre_id = genres.id
///             ├─ Scan: movies
///             └─ Scan: genres
///
/// Rows flow from the tree leaves to the root:
///
/// 1. Scan nodes read rows from movies and genres.
/// 2. NestedLoopJoin joins the rows from movies and genres.
/// 3. Filter discards rows with release dates older than 2000.
/// 4. Projection picks out the requested column values from the rows.
/// 5. Order sorts the rows by release date.
/// 6. Select returns the final rows to the client.
pub struct Executor<'a, T: Transaction> {
    /// The transaction used to execute the plan.
    txn: &'a T,
}

impl<'a, T: Transaction> Executor<'a, T> {
    /// Creates a new executor.
    pub fn new(txn: &'a T) -> Self {
        Self { txn }
    }

    /// Runs a statement plan against the transaction and reports the outcome.
    /// Write statements report the number of affected rows, while SELECT
    /// returns the column labels along with a row iterator.
    pub fn execute(&mut self, plan: Plan) -> Result<ExecutionResult> {
        let result = match plan {
            // Schema changes: CREATE TABLE and DROP TABLE.
            Plan::CreateTable { schema } => {
                let name = schema.name.clone();
                self.txn.create_table(schema)?;
                ExecutionResult::CreateTable { name }
            }
            Plan::DropTable { name, if_exists } => {
                let existed = self.txn.drop_table(&name, if_exists)?;
                ExecutionResult::DropTable { name, existed }
            }

            // Queries: SELECT. The labels must be fetched before the root
            // node is consumed by execution.
            Plan::Select(root) => {
                let mut columns = Vec::with_capacity(root.columns());
                for index in 0..root.columns() {
                    columns.push(root.column_label(index));
                }
                let rows = self.execute_node(root)?;
                ExecutionResult::Select { columns, rows }
            }

            // Writes: DELETE, INSERT, and UPDATE. These take their input rows
            // from the source node.
            Plan::Delete { table, primary_key, source } => {
                let rows = self.execute_node(source)?;
                let count = self.delete(&table, primary_key, rows)?;
                ExecutionResult::Delete { count }
            }
            Plan::Insert { table, column_map, source } => {
                let rows = self.execute_node(source)?;
                let count = self.insert(table, column_map, rows)?;
                ExecutionResult::Insert { count }
            }
            Plan::Update { table, primary_key, source, expressions } => {
                let rows = self.execute_node(source)?;
                let count = self.update(&table.name, primary_key, rows, expressions)?;
                ExecutionResult::Update { count }
            }
        };
        Ok(result)
    }

    /// Recursively executes a query plan node, returning a row iterator.
    ///
    /// Child nodes are executed first, and the returned iterator wraps the
    /// child iterators. Most nodes process rows lazily as they are pulled,
    /// but some (Aggregate, IndexLookup, KeyLookup, Order) do their work
    /// before returning. Errors during that up-front work are returned
    /// directly, others are emitted as items of the row iterator.
    fn execute_node(&mut self, node: Node) -> Result<Rows> {
        Ok(match node {
            // GROUP BY and aggregate functions. The entire source is consumed
            // by the aggregator before any result rows are emitted.
            Node::Aggregate { source, group_by, aggregates } => {
                let source = self.execute_node(*source)?;
                let mut aggregator = Aggregator::new(group_by, aggregates);
                aggregator.add_rows(source)?;
                aggregator.into_rows()
            }

            // WHERE and similar filtering. Rows are kept if the predicate
            // evaluates to true, and dropped if it evaluates to false or
            // NULL. Source and evaluation errors are passed on as items.
            Node::Filter { source, predicate } => {
                let source = self.execute_node(*source)?;
                Box::new(source.filter_map(move |result| {
                    result
                        .and_then(|row| match predicate.evaluate(Some(&row))? {
                            Value::Boolean(true) => Ok(Some(row)),
                            Value::Boolean(false) | Value::Null => Ok(None),
                            value => errinput!("filter returned {value}, expected boolean",),
                        })
                        .transpose()
                }))
            }

            // JOIN using a hash join.
            Node::HashJoin { left, left_column, right, right_column, outer } => {
                // Fetch the right column count before the node is consumed.
                let right_columns = right.columns();
                let left = self.execute_node(*left)?;
                let right = self.execute_node(*right)?;
                Box::new(HashJoiner::new(
                    left,
                    left_column,
                    right,
                    right_column,
                    right_columns,
                    outer,
                )?)
            }

            // Looks up primary keys by secondary index values. The primary
            // keys are then used to fetch the rows.
            Node::IndexLookup { table, column, values, alias: _ } => {
                let column = table.columns.into_iter().nth(column).expect("invalid column").name;
                let ids =
                    self.txn.lookup_index(&table.name, &column, &values)?.into_iter().collect_vec();
                Box::new(self.txn.get(&table.name, &ids)?.into_iter().map(Ok))
            }

            // Looks up rows by primary key.
            Node::KeyLookup { table, keys, alias: _ } => {
                Box::new(self.txn.get(&table.name, &keys)?.into_iter().map(Ok))
            }

            // LIMIT
            Node::Limit { source, limit } => Box::new(self.execute_node(*source)?.take(limit)),

            // JOIN using a nested loop join.
            Node::NestedLoopJoin { left, right, predicate, outer } => {
                // Fetch the right column count before the node is consumed.
                let right_columns = right.columns();
                let left = self.execute_node(*left)?;
                let right = self.execute_node(*right)?;
                Box::new(NestedLoopJoiner::new(left, right, right_columns, predicate, outer))
            }

            // An empty row iterator.
            Node::Nothing { .. } => Box::new(std::iter::empty()),

            // OFFSET
            Node::Offset { source, offset } => Box::new(self.execute_node(*source)?.skip(offset)),

            // ORDER BY
            Node::Order { source, key } => {
                let source = self.execute_node(*source)?;
                Box::new(Self::order(source, key)?)
            }

            // Projects columns from the source, and evaluates expressions.
            Node::Projection { source, expressions, aliases: _ } => {
                let source = self.execute_node(*source)?;
                Box::new(source.map(move |result| {
                    let row = result?;
                    expressions.iter().map(|expr| expr.evaluate(Some(&row))).collect()
                }))
            }

            // Remaps source column indexes to new target column indexes.
            Node::Remap { source, targets } => {
                let source = self.execute_node(*source)?;
                // The output row has room for the largest target index.
                let size = targets.iter().copied().flatten().map(|i| i + 1).max().unwrap_or(0);
                Box::new(source.map_ok(move |row| {
                    // Source columns without a target are dropped, and target
                    // columns without a source remain NULL.
                    let mut remapped = vec![Value::Null; size];
                    for (target, value) in targets.iter().copied().zip_eq(row) {
                        if let Some(target) = target {
                            remapped[target] = value;
                        }
                    }
                    remapped
                }))
            }

            // Scans a table, optionally filtering rows.
            Node::Scan { table, filter, alias: _ } => Box::new(self.txn.scan(&table.name, filter)?),

            // Emits constant values. The expressions are evaluated without
            // any input row.
            Node::Values { rows } => Box::new(
                rows.into_iter()
                    .map(|row| row.into_iter().map(|expr| expr.evaluate(None)).collect()),
            ),
        })
    }

    /// DELETE: removes the rows emitted by the source. The primary key of
    /// each row is taken from the column at index primary_key. Returns the
    /// number of rows deleted.
    ///
    /// Panics if a source row doesn't have a primary_key column.
    fn delete(&self, table: &str, primary_key: usize, source: Rows) -> Result<u64> {
        // Collect all primary keys up front and delete them in one call.
        let mut ids = Vec::new();
        for row in source {
            let id = row?.into_iter().nth(primary_key).expect("short row");
            ids.push(id);
        }
        self.txn.delete(table, &ids)?;
        Ok(ids.len() as u64)
    }

    /// INSERT: writes the rows emitted by the source to a table. Returns the
    /// number of rows inserted.
    ///
    /// If column_map is given, it maps table column indexes to source column
    /// indexes, and must cover every column in the source. Otherwise, the
    /// source columns line up with the table columns, although a tail of
    /// columns may be missing. Missing columns get the column's default
    /// value, and it is an error if there is none.
    fn insert(
        &self,
        table: Table,
        column_map: Option<HashMap<usize, usize>>,
        mut source: Rows,
    ) -> Result<u64> {
        let width = table.columns.len();
        let mut rows = Vec::new();
        while let Some(values) = source.next().transpose()? {
            // Fast path: complete rows without a column mapping are used
            // as is.
            if column_map.is_none() && values.len() == width {
                rows.push(values);
                continue;
            }
            if values.len() > width {
                return errinput!("too many values for table {}", table.name);
            }
            if column_map.as_ref().is_some_and(|map| map.len() != values.len()) {
                return errinput!("column and value counts do not match");
            }

            // Build the row column by column, from either the source value or
            // the column default.
            let mut row = Vec::with_capacity(width);
            for (i, column) in table.columns.iter().enumerate() {
                // Find the source column for this table column, if any.
                let source_index = match &column_map {
                    Some(map) => map.get(&i).copied(),
                    None => (i < values.len()).then_some(i),
                };
                let value = match (source_index, &column.default) {
                    (Some(vi), _) => values[vi].clone(),
                    (None, Some(default)) => default.clone(),
                    (None, None) => {
                        return errinput!(
                            "no value given for column {} with no default",
                            column.name
                        );
                    }
                };
                row.push(value);
            }
            rows.push(row);
        }

        let count = rows.len() as u64;
        self.txn.insert(&table.name, rows)?;
        Ok(count)
    }

    /// UPDATE: updates rows passed in from the source. Returns the number of
    /// rows updated.
    fn update(
        &self,
        table: &str,
        primary_key: usize,
        mut source: Rows,
        expressions: Vec<(usize, Expression)>,
    ) -> Result<u64> {
        let mut updates = BTreeMap::new();
        while let Some(row) = source.next().transpose()? {
            let mut update = row.clone();
            for (column, expr) in &expressions {
                update[*column] = expr.evaluate(Some(&row))?;
            }
            let id = row.into_iter().nth(primary_key).expect("short row");
            updates.insert(id, update);
        }
        let count = updates.len() as u64;
        self.txn.update(table, updates)?;
        Ok(count)
    }

    /// Sorts the input rows by the given expressions and directions. Earlier
    /// expressions take precedence, and rows with equal sort keys keep their
    /// input order.
    fn order(source: Rows, order: Vec<(Expression, Direction)>) -> Result<Rows> {
        // Expression evaluation is fallible and each expression may sort in
        // a different direction, so sorted_by_cached_key() can't be used.
        // Instead, buffer each row along with its pre-computed sort keys.
        let mut keyed: Vec<(Row, Vec<Value>)> = Vec::new();
        for result in source {
            let row = result?;
            let mut keys = Vec::with_capacity(order.len());
            for (expr, _) in &order {
                keys.push(expr.evaluate(Some(&row))?);
            }
            keyed.push((row, keys));
        }

        // The first sort key that differs decides the ordering.
        keyed.sort_by(|(_, a_keys), (_, b_keys)| {
            izip!(a_keys, b_keys, &order)
                .map(|(a_key, b_key, (_, dir))| {
                    let ordering = a_key.cmp(b_key);
                    if *dir == Direction::Descending { ordering.reverse() } else { ordering }
                })
                .find(|ordering| *ordering != Ordering::Equal)
                .unwrap_or(Ordering::Equal)
        });

        Ok(Box::new(keyed.into_iter().map(|(row, _)| Ok(row))))
    }
}

/// A plan execution result. There is one variant per kind of executed
/// statement.
pub enum ExecutionResult {
    /// CREATE TABLE: carries the name of the table.
    CreateTable { name: String },
    /// DROP TABLE: carries the name of the table and an `existed` flag
    /// (presumably false only when dropping a missing table with IF EXISTS;
    /// confirm against the executor).
    DropTable { name: String, existed: bool },
    /// DELETE: carries a row count (presumably the number of deleted rows).
    Delete { count: u64 },
    /// INSERT: carries a row count (presumably the number of inserted rows).
    Insert { count: u64 },
    /// UPDATE: carries the number of rows updated.
    Update { count: u64 },
    /// SELECT: carries the column labels and an iterator over the result
    /// rows. The rows have not been collected yet.
    Select { columns: Vec<Label>, rows: Rows },
}


================================================
FILE: src/sql/execution/join.rs
================================================
use std::collections::HashMap;
use std::iter::Peekable;

use crate::errinput;
use crate::error::Result;
use crate::sql::types::{Expression, Row, Rows, Value};

/// NestedLoopJoiner implements nested loop joins.
///
/// For every row in the left source, iterate over the right source and join
/// them. Rows are filtered on the join predicate, if given.
///
/// If outer is true, and there are no matches in the right source for a row in
/// the left source, a joined row with NULL values for the right source is
/// returned (typically used for a LEFT JOIN).
///
/// This could be trivially implemented with cartesian_product(), but we need
/// to handle the left outer join case where there is no match in the right
/// source.
#[derive(Clone)]
pub struct NestedLoopJoiner {
    /// The left source. Peekable, so that the current left row can stay in
    /// place while the right source is iterated across multiple calls.
    left: Peekable<Rows>,
    /// The right source. Holds the remaining right rows for the current left
    /// row.
    right: Rows,
    /// The original right iterator state. Can be cloned to reset the
    /// right source to its original state.
    right_original: Rows,
    /// The number of columns in the right source. Determines how many NULLs
    /// to emit for an unmatched left row in an outer join.
    right_columns: usize,
    /// True if a right match has been seen for the current left row. Also set
    /// once the outer-join NULL row has been emitted for it.
    right_matched: bool,
    /// The join predicate. If None, every row combination matches.
    predicate: Option<Expression>,
    /// If true, emit a row when there is no match in the right source.
    outer: bool,
}

impl NestedLoopJoiner {
    /// Creates a new nested loop joiner over the left and right sources.
    ///
    /// `right_columns` is the width of the right source, used to pad
    /// unmatched left rows with NULLs when `outer` is true. `predicate`
    /// filters joined rows, if given.
    pub fn new(
        left: Rows,
        right: Rows,
        right_columns: usize,
        predicate: Option<Expression>,
        outer: bool,
    ) -> Self {
        Self {
            left: left.peekable(),
            // Keep a pristine copy of the right source to rewind to. This
            // must be cloned before `right` is moved into the struct below.
            right_original: right.clone(),
            right,
            right_columns,
            right_matched: false,
            predicate,
            outer,
        }
    }

    /// Returns the next joined row, if any.
    ///
    /// The current left row is only peeked, so that consecutive calls keep
    /// joining it with the remaining right rows until they are exhausted.
    fn try_next(&mut self) -> Result<Option<Row>> {
        // As long as there is a valid left row, look for the next right row
        // that joins with it.
        while let Some(Ok(left)) = self.left.peek() {
            for right in self.right.by_ref() {
                let joined: Row = left.iter().cloned().chain(right?).collect();
                let accept = match &self.predicate {
                    None => true,
                    Some(predicate) => match predicate.evaluate(Some(&joined))? {
                        Value::Boolean(matched) => matched,
                        Value::Null => false,
                        v => return errinput!("join predicate returned {v}, expected boolean"),
                    },
                };
                if accept {
                    self.right_matched = true;
                    return Ok(Some(joined));
                }
            }

            // The right source is exhausted. For an outer join where nothing
            // matched the left row, emit it once padded with right NULLs.
            // Setting right_matched ensures the next call moves on.
            if self.outer && !self.right_matched {
                self.right_matched = true;
                let nulls = std::iter::repeat_n(Value::Null, self.right_columns);
                return Ok(Some(left.iter().cloned().chain(nulls).collect()));
            }

            // Rewind the right source and advance to the next left row.
            self.right_matched = false;
            self.right = self.right_original.clone();
            self.left.next().transpose()?;
        }

        // The left source is either exhausted or yielded an error. Return it.
        self.left.next().transpose()
    }
}

impl Iterator for NestedLoopJoiner {
    type Item = Result<Row>;

    /// Yields the next joined row, or an error. Ends when `try_next` reports
    /// that there are no more rows.
    fn next(&mut self) -> Option<Self::Item> {
        match self.try_next() {
            Ok(Some(row)) => Some(Ok(row)),
            Ok(None) => None,
            Err(error) => Some(Err(error)),
        }
    }
}

/// HashJoiner implements hash joins.
///
/// This builds a hash table of rows from the right source keyed on the join
/// value, then iterates over the left source and looks up matching rows in the
/// hash table.
///
/// If outer is true, and there is no match in the right source for a row in the
/// left source, a row with NULL values for the right source is emitted instead.
#[derive(Clone)]
pub struct HashJoiner {
    /// The left source. Streamed row by row.
    left: Rows,
    /// The left column to join on, as an index into the left row.
    left_column: usize,
    /// The right hash map to join on. Maps a join value to all right rows
    /// that have that value in the right join column.
    right: HashMap<Value, Vec<Row>>,
    /// The number of columns in the right source. Determines how many NULLs
    /// to emit for an unmatched left row in an outer join.
    right_columns: usize,
    /// If true, emit a row when there is no match in the right source.
    outer: bool,
    /// Any pending matches to emit. Holds the remaining joined rows for the
    /// most recently matched left row.
    pending: Rows,
}

impl HashJoiner {
    /// Creates a new hash joiner.
    ///
    /// Eagerly consumes the entire right source and indexes its rows by the
    /// value in `right_column`. Returns an error if the right source does.
    /// `left_column` is the left row index to look up in that index, and
    /// `right_columns` is the width of the right source, used for NULL
    /// padding when `outer` is true.
    pub fn new(
        left: Rows,
        left_column: usize,
        mut right: Rows,
        right_column: usize,
        right_columns: usize,
        outer: bool,
    ) -> Result<Self> {
        // Index the right rows by their join value.
        let mut buckets: HashMap<Value, Vec<Row>> = HashMap::new();
        for row in right.by_ref() {
            let row = row?;
            let key = &row[right_column];
            if key.is_undefined() {
                continue; // undefined will never match anything
            }
            buckets.entry(key.clone()).or_default().push(row);
        }

        let nothing_pending = Box::new(std::iter::empty());

        Ok(Self {
            left,
            left_column,
            right: buckets,
            right_columns,
            outer,
            pending: nothing_pending,
        })
    }

    /// Returns the next joined row, if any.
    fn try_next(&mut self) -> Result<Option<Row>> {
        // Drain joined rows stashed by a previous call first.
        if let Some(result) = self.pending.next() {
            return result.map(Some);
        }

        // Otherwise, advance the left source until a row produces output.
        for left in self.left.by_ref() {
            let left = left?;
            let matches = self.right.get(&left[self.left_column]).cloned();
            match matches {
                // Join the left row with every right match. Stash the joined
                // rows in pending and return the first one.
                Some(matches) => {
                    self.pending = Box::new(
                        matches
                            .into_iter()
                            .map(move |right| left.iter().cloned().chain(right).collect())
                            .map(Ok),
                    );
                    return self.pending.next().transpose();
                }
                // No match, but this is an outer join: emit the left row
                // padded with right NULLs.
                None if self.outer => {
                    let nulls = std::iter::repeat_n(Value::Null, self.right_columns);
                    return Ok(Some(left.into_iter().chain(nulls).collect()));
                }
                // No match for an inner join: skip the left row.
                None => {}
            }
        }

        Ok(None)
    }
}

impl Iterator for HashJoiner {
    type Item = Result<Row>;

    /// Yields the next joined row, or an error. Ends when `try_next` reports
    /// that there are no more rows.
    fn next(&mut self) -> Option<Self::Item> {
        match self.try_next() {
            Ok(Some(row)) => Some(Ok(row)),
            Ok(None) => None,
            Err(error) => Some(Err(error)),
        }
    }
}


================================================
FILE: src/sql/execution/mod.rs
================================================
//! Executes statements and plans.

mod aggregator;
mod executor;
mod join;
mod session;

pub use executor::{ExecutionResult, Executor};
pub use session::{Session, StatementResult};


================================================
FILE: src/sql/execution/session.rs
================================================
use itertools::Itertools as _;
use log::error;
use serde::{Deserialize, Serialize};

use crate::error::{Error, Result};
use crate::sql::engine::{Engine, Raft, Status, Transaction as _};
use crate::sql::execution::ExecutionResult;
use crate::sql::parser::{Parser, ast};
use crate::sql::planner::Plan;
use crate::sql::types::{Label, Row, Rows, Value};
use crate::storage::mvcc;
use crate::{errdata, errinput};

/// A SQL client session. Parses and executes raw SQL statements and handles
/// transaction control.
///
/// A session holds at most one explicit transaction at a time, opened with
/// BEGIN and closed with COMMIT or ROLLBACK. Statements executed outside of an
/// explicit transaction run in a temporary implicit transaction.
pub struct Session<'a, E: Engine<'a>> {
    /// The SQL engine.
    engine: &'a E,
    /// The current explicit transaction, if any. None when the session is not
    /// inside a BEGIN ... COMMIT/ROLLBACK block.
    txn: Option<E::Transaction>,
}

impl<'a, E: Engine<'a>> Session<'a, E> {
    /// Creates a new session using the given SQL engine, with no open
    /// transaction.
    pub fn new(engine: &'a E) -> Self {
        Self { txn: None, engine }
    }

    /// Parses and executes a single client statement.
    ///
    /// Transaction control (BEGIN, COMMIT, ROLLBACK) and EXPLAIN are handled
    /// by the session itself. All other statements are planned, optimized,
    /// and run by the SQL executor, inside the session's explicit transaction
    /// if there is one and otherwise in an implicit transaction.
    ///
    /// Returns an input error when BEGIN is used inside a transaction, when
    /// COMMIT or ROLLBACK is used outside of one, or when a read-write
    /// transaction is requested as of a given version.
    pub fn execute(&mut self, statement: &str) -> Result<StatementResult> {
        match Parser::parse(statement)? {
            // BEGIN: starts a new transaction and returns its state.
            ast::Statement::Begin { read_only, as_of } => {
                if self.txn.is_some() {
                    return errinput!("already in a transaction");
                }
                let txn = match (read_only, as_of) {
                    (false, Some(_)) => {
                        return errinput!("can't start read-write transaction in a given version");
                    }
                    (false, None) => self.engine.begin()?,
                    (true, None) => self.engine.begin_read_only()?,
                    (true, Some(version)) => self.engine.begin_as_of(version)?,
                };
                Ok(StatementResult::Begin(self.txn.insert(txn).state().clone()))
            }

            // COMMIT: commits the currently open transaction, if any. The
            // session leaves the transaction even if the commit fails.
            ast::Statement::Commit => match self.txn.take() {
                Some(txn) => {
                    let version = txn.state().version;
                    txn.commit()?;
                    Ok(StatementResult::Commit { version })
                }
                None => return errinput!("not in a transaction"),
            },

            // ROLLBACK: rolls back the currently open transaction, if any.
            ast::Statement::Rollback => match self.txn.take() {
                Some(txn) => {
                    let version = txn.state().version;
                    txn.rollback()?;
                    Ok(StatementResult::Rollback { version })
                }
                None => return errinput!("not in a transaction"),
            },

            // EXPLAIN: builds and returns the plan for the inner statement,
            // without executing it. Planning only needs read access.
            ast::Statement::Explain(inner) => self.with_txn(true, |txn| {
                let plan = Plan::build(*inner, txn)?.optimize()?;
                Ok(StatementResult::Explain(plan))
            }),

            // Everything else (SELECT etc.) goes to the SQL executor. Only
            // SELECT can use a read-only implicit transaction.
            statement => {
                let read_only = matches!(statement, ast::Statement::Select { .. });
                self.with_txn(read_only, |txn| {
                    let plan = Plan::build(statement, txn)?.optimize()?;
                    plan.execute(txn)?.try_into()
                })
            }
        }
    }

    /// Runs a closure in the session's transaction, if there is one, otherwise
    /// in a temporary implicit transaction. If read_only is true, the implicit
    /// transaction is read-only. Does not automatically retry errors.
    ///
    /// An implicit transaction is committed if the closure succeeds and rolled
    /// back if it fails. An error from the commit or rollback itself takes
    /// precedence over the closure's result.
    pub fn with_txn<F, T>(&mut self, read_only: bool, f: F) -> Result<T>
    where
        F: FnOnce(&mut E::Transaction) -> Result<T>,
    {
        // Prefer the explicit transaction. The client decides when it ends.
        if let Some(txn) = self.txn.as_mut() {
            return f(txn);
        }

        // Otherwise, use an implicit transaction. Doing this session-side
        // results in additional Raft roundtrips to begin and complete the
        // transaction -- we could avoid this if the Raft SQL state machine
        // supported implicit transactions, but we keep it simple.
        let mut txn =
            if read_only { self.engine.begin_read_only()? } else { self.engine.begin()? };
        let result = f(&mut txn);
        if result.is_ok() {
            txn.commit()?;
        } else {
            txn.rollback()?;
        }
        result
    }
}

/// Methods only available for sessions backed by the Raft SQL engine.
impl Session<'_, Raft> {
    /// Returns the Raft SQL engine status. Delegates directly to the engine,
    /// and does not involve the session's transaction.
    pub fn status(&self) -> Result<Status> {
        self.engine.status()
    }
}

/// Rolls back the session's open transaction, if any, when the session is
/// dropped. A failed rollback can't be returned from drop, so it is logged.
impl<'a, E: Engine<'a>> Drop for Session<'a, E> {
    fn drop(&mut self) {
        if let Some(txn) = self.txn.take() {
            if let Err(error) = txn.rollback() {
                error!("implicit transaction rollback failed: {error}")
            }
        }
    }
}

/// A session statement result, returned over the network to SQL clients.
///
/// Covers both the transaction control statements handled by the session and
/// the results of statements run by the SQL executor.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub enum StatementResult {
    /// BEGIN: the state of the newly started transaction.
    Begin(mvcc::TransactionState),
    /// COMMIT: the version of the transaction that was committed.
    Commit { version: mvcc::Version },
    /// ROLLBACK: the version of the transaction that was rolled back.
    Rollback { version: mvcc::Version },
    /// EXPLAIN: the optimized plan of the explained statement.
    Explain(Plan),
    /// CREATE TABLE: the name of the table.
    CreateTable { name: String },
    /// DROP TABLE: the name of the table, and an `existed` flag passed
    /// through from the execution result.
    DropTable { name: String, existed: bool },
    /// DELETE: the row count reported by the executor.
    Delete { count: u64 },
    /// INSERT: the row count reported by the executor.
    Insert { count: u64 },
    /// UPDATE: the row count reported by the executor.
    Update { count: u64 },
    /// SELECT: the column labels and the fully buffered result rows.
    // For simplicity, we buffer and send the entire set of rows as a vector
    // instead of streaming them to the client. Streaming reads haven't been
    // implemented from Raft either, so they're buffered all the way through.
    Select { columns: Vec<Label>, rows: Vec<Row> },
}

/// Converts an execution result into a statement result.
impl TryFrom<ExecutionResult> for StatementResult {
    type Error = Error;

    fn try_from(result: ExecutionResult) -> Result<Self> {
        Ok(match result {
            ExecutionResult::CreateTable { name } => Self::CreateTable { name },
            ExecutionResult::DropTable { name, existed } => Self::DropTable { name, existed },
            ExecutionResult::Delete { count } => Self::Delete { count },
            ExecutionResult::Insert { count } => Self::Insert { count },
            ExecutionResult::Update { count } => Self::Update { count },
            ExecutionResult::Select { rows, columns } => {
                // We buffer the entire set of rows, for simplicity.
                Self::Select { columns, rows: rows.try_collect()? }
            }
        })
    }
}

/// Attempts to convert a SELECT result into a row iterator.
impl TryFrom<StatementResult> for Rows {
    type Error = Error;

    fn try_from(result: StatementResult) -> Result<Self> {
        let StatementResult::Select { rows, .. } = result else {
            return errdata!("expected select result, found {result:?}");
        };
        Ok(Box::new(rows.into_iter().map(Ok)))
    }
}

/// Extracts the first row from a SELECT result.
impl TryFrom<StatementResult> for Row {
    type Error = Error;

    fn try_from(result: StatementResult) -> Result<Self> {
        let mut rows: Rows = result.try_into()?;
        rows.next().transpose()?.ok_or_else(|| errdata!("no rows returned"))
    }
}

/// Extracts the value of the first column in the first row.
impl TryFrom<StatementResult> for Value {
    type Error = Error;

    fn try_from(result: StatementResult) -> Result<Self> {
        let row: Row = result.try_into()?;
        row.into_iter().next().ok_or_else(|| errdata!("no columns returned"))
    }
}

/// Extracts the value of the first column in the first row as a boolean.
impl TryFrom<StatementResult> for bool {
    type Error = Error;

    fn try_from(result: StatementResult) -> Result<Self> {
        let value = Value::try_from(result)?;
        value.try_into()
    }
}

/// Extracts the value of the first column in the first row as an f64.
impl TryFrom<StatementResult> for f64 {
    type Error = Error;

    fn try_from(result: StatementResult) -> Result<Self> {
        let value = Value::try_from(result)?;
        value.try_into()
    }
}

/// Extracts the value of the first column in the first row as an i64.
impl TryFrom<StatementResult> for i64 {
    type Error = Error;

    fn try_from(result: StatementResult) -> Result<Self> {
        let value = Value::try_from(result)?;
        value.try_into()
    }
}

/// Extracts the value of the first column in the first row as a string.
impl TryFrom<StatementResult> for String {
    type Error = Error;

    fn try_from(result: StatementResult) -> Result<Self> {
        let value = Value::try_from(result)?;
        value.try_into()
    }
}


================================================
FILE: src/sql/mod.rs
================================================
//! Implements a SQL execution engine. A SQL statement flows through the engine
//! as follows:
//!
//! 1. The `toySQL` client connects to the server, which creates a new
//!    `sql::execution::Session` in `Server::sql_session`.
//!
//! 2. `toySQL` submits a SQL `SELECT` string, which the server executes via
//!    `Session::execute`.
//!
//! 3. `Session::execute` calls `Parser::parse` to parse the SQL `SELECT` string
//!    into an `ast::Statement::Select` AST (Abstract Syntax Tree). The parser
//!    uses the `Lexer` for initial tokenization.
//!     
//! 4. `Session::execute` obtains a new read-only `sql::engine::Transaction` via
//!    `Session::with_txn`. We'll gloss over the details here.
//!
//! 5. `Session::execute` calls `Plan::build` to construct an execution plan
//!    from the AST via the `Planner`, using the `Transaction`'s
//!    `sql::engine::Catalog` trait to look up table schema information.
//!
//! 6. `Session::execute` calls `Plan::optimize` to optimize the execution plan
//!    via the optimizers in `sql::planner::optimizer`. This e.g. performs
//!    filter pushdown to filter rows during storage scans, uses secondary
//!    indexes where appropriate, and chooses more efficient join types.
//!
//! 7. `Session::execute` calls `Plan::execute` to actually execute the plan,
//!    using the `Transaction` to access the `sql::engine::Engine`.  It uses the
//!    executors in `sql::execution` to recursively execute the
//!    `sql::planner::Node` nodes, which stream and process `sql::types::Row`
//!    vectors via `sql::types::Rows` iterators.
//!
//! 8. At the tip of the execution plan there's typically a `Node::Scan` which
//!    performs full table scans from storage. It is executed by
//!    `sql::execution::source::scan`, which calls `Transaction::scan`.
//!
//! 9. The upper `sql::engine::Raft` engine submits a `Read::Scan` request to
//!    Raft via `Raft::read` and `Raft::execute`. This is submitted through the
//!    crossbeam channel `Raft::tx`, which is routed to the local Raft node in
//!    `Server::raft_route` via `raft::Node::step`.
//!
//! 10. We'll skip Raft details, but see the `raft` module documentation. The
//!     `Read::Scan` request eventually makes its way to the SQL state machine
//!     `sql::engine::raft::State` that's managed by Raft. Since this is a read
//!     request, it is executed only on the leader node, calling `State::read`.
//!
//! 11. `State` wraps the `sql::engine::Local` SQL execution engine that runs
//!     on each node, using local storage. `State::read` calls
//!     `Transaction::scan` using a `Local::Transaction`.
//!
//! 12. The `Local` engine uses a `storage::BitCask` engine for local storage,
//!     with `storage::mvcc` providing transactions. See their documentation
//!     for details.
//!
//! 13. `Transaction::scan` uses `sql::engine::KeyPrefix::Table` to obtain the
//!     key prefix for the scanned table, encoded via `encoding::keycode`. It
//!     scans rows under this prefix by calling `MVCC::scan_prefix`, which in
//!     turn dispatches to `BitCask::scan_prefix`. It returns a row iterator.
//!
//! 14. A row iterator is propagated back up through the stack:
//!     `BitCask` → `MVCC` → `Local` → `State` → `Raft` → `scan` → `Plan::execute`
//!
//! 15. `Plan::execute` collects the results in an `ExecutionResult::Select`,
//!     and returns it to `Session::execute`. It in turn returns it to
//!     `Server::sql_session`, which encodes it and sends it across the wire
//!     to `toySQL`, which displays it to the user.
//!
//! TODO: expand this into a "Life of a SQL statement" document.

pub mod engine;
pub mod execution;
pub mod parser;
pub mod planner;
pub mod types;

/// SQL tests are implemented as goldenscripts under src/sql/testscripts.
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::error::Error;
    use std::fmt::Write as _;
    use std::path::Path;
    use std::result::Result;

    use crossbeam::channel::Receiver;
    use itertools::Itertools as _;
    use tempfile::TempDir;
    use test_each_file::test_each_path;

    use super::engine::Catalog as _;
    use super::execution::{Session, StatementResult};
    use super::parser::Parser;
    use super::planner::{OPTIMIZERS, Plan};
    use crate::encoding::format::{self, Formatter as _};
    use crate::sql::engine::{Engine, Local};
    use crate::sql::planner::{Planner, Scope};
    use crate::storage::engine::test as testengine;
    use crate::storage::{self, Engine as _};

    // Run goldenscript tests in src/sql/testscripts.
    test_each_path! { in "src/sql/testscripts/expressions" as expressions => test_goldenscript_expr }
    test_each_path! { in "src/sql/testscripts/optimizers" as optimizers => test_goldenscript }
    test_each_path! { in "src/sql/testscripts/queries" as queries => test_goldenscript }
    test_each_path! { in "src/sql/testscripts/schema" as schema => test_goldenscript }
    test_each_path! { in "src/sql/testscripts/transactions" as transactions => test_goldenscript }
    test_each_path! { in "src/sql/testscripts/writes" as writes => test_goldenscript }

    /// Runs the SQL goldenscript at the given path against a fresh engine.
    fn test_goldenscript(path: &Path) {
        // The runner's Session can't borrow from an Engine in the same struct,
        // so the engine lives here and the runner gets a reference to it.
        // Operations are mirrored across a BitCask and a Memory engine, and
        // emitted to op_rx.
        let tempdir = TempDir::with_prefix("toydb").expect("tempdir failed");
        let (op_tx, op_rx) = crossbeam::channel::unbounded();
        let mirror = testengine::Mirror::new(
            storage::BitCask::new(tempdir.path().join("bitcask")).expect("bitcask failed"),
            storage::Memory::new(),
        );
        let engine = Local::new(testengine::Emit::new(mirror, op_tx));
        let mut runner = SQLRunner::new(&engine, op_rx);

        goldenscript::run(&mut runner, path).expect("goldenscript failed")
    }

    /// Runs the expression goldenscript at the given path. Expressions are
    /// evaluated by an `ExpressionRunner`, without any storage engine.
    fn test_goldenscript_expr(path: &Path) {
        let mut runner = ExpressionRunner;
        goldenscript::run(&mut runner, path).expect("goldenscript failed")
    }

    /// The SQL test runner. Executes goldenscript commands as SQL statements
    /// (or runner commands) against a shared test engine.
    struct SQLRunner<'a> {
        /// The engine that all sessions run against.
        engine: &'a TestEngine,
        /// Sessions keyed by command prefix ("" for commands without one).
        /// Created on first use.
        sessions: HashMap<String, Session<'a, TestEngine>>,
        /// Receives the storage engine operations emitted by the engine.
        op_rx: Receiver<testengine::Operation>,
    }

    /// The engine used by SQL tests: a local SQL engine on top of a storage
    /// engine that emits its operations and mirrors them across BitCask and
    /// Memory.
    type TestEngine =
        Local<testengine::Emit<testengine::Mirror<storage::BitCask, storage::Memory>>>;

    impl<'a> SQLRunner<'a> {
        fn new(engine: &'a TestEngine, op_rx: Receiver<testengine::Operation>) -> Self {
            Self { engine, sessions: HashMap::new(), op_rx }
        }
    }

    impl goldenscript::Runner for SQLRunner<'_> {
        /// Runs a single goldenscript command and returns its output.
        ///
        /// The command prefix selects the session to use ("" if none). The
        /// runner commands `dump` and `schema [TABLE...]` are handled
        /// directly. Any other command is executed as a SQL statement, and
        /// must not have any args. The following tags are supported for SQL
        /// statements:
        ///
        /// * `plan`: output the optimized plan.
        /// * `opt`: output the initial plan and each optimization that
        ///   changes it. Only for SELECT. Can't be combined with `plan`.
        /// * `ops`: output the storage engine operations.
        /// * `header`: output the column header of SELECT results.
        /// * `result`: output the result of non-SELECT statements.
        ///
        /// Unknown tags are rejected.
        fn run(&mut self, command: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
            let mut output = String::new();

            // Obtain a session based on the command prefix ("" if none).
            let prefix = command.prefix.clone().unwrap_or_default();
            let session = self.sessions.entry(prefix).or_insert_with(|| self.engine.session());

            // Handle runner commands.
            match command.name.as_str() {
                // dump
                "dump" => {
                    command.consume_args().reject_rest()?;
                    let mut engine = self.engine.mvcc.engine.lock().expect("mutex failed");
                    let mut iter = engine.scan(..);
                    while let Some((key, value)) = iter.next().transpose()? {
                        let fmtkv = format::MVCC::<format::SQL>::key_value(&key, &value);
                        let rawkv = format::Raw::key_value(&key, &value);
                        writeln!(output, "{fmtkv} [{rawkv}]",)?;
                    }
                    return Ok(output);
                }

                // schema [TABLE...]
                "schema" => {
                    let mut args = command.consume_args();
                    let tables = args.rest_pos().iter().map(|arg| arg.value.clone()).collect_vec();
                    args.reject_rest()?;

                    let schemas = if tables.is_empty() {
                        session.with_txn(true, |txn| txn.list_tables())?
                    } else {
                        tables
                            .into_iter()
                            .map(|t| session.with_txn(true, |txn| txn.must_get_table(&t)))
                            .try_collect()?
                    };
                    return Ok(schemas.into_iter().join("\n"));
                }

                // Otherwise, fall through to SQL execution.
                _ => {}
            }

            // The entire command is the SQL statement. There are no args.
            if !command.args.is_empty() {
                return Err("SQL statements should be given as a command with no args".into());
            }
            let input = &command.name;
            let mut tags = command.tags.clone();

            // Consume both plan tags before acting on either. Checking for
            // "plan" after it has already been removed from the set would
            // never detect the combination.
            let show_plan = tags.remove("plan");
            let show_opt = tags.remove("opt");
            if show_plan && show_opt {
                return Err("using both plan and opt is redundant".into());
            }

            // Output the plan if requested.
            if show_plan {
                let ast = Parser::parse(input)?;
                let plan =
                    session.with_txn(true, |txn| Planner::new(txn).build(ast)?.optimize())?;
                writeln!(output, "{plan}")?;
            }

            // Output plan optimizations if requested. Only optimizers that
            // actually change the plan are shown.
            if show_opt {
                let ast = Parser::parse(input)?;
                let plan = session.with_txn(true, |txn| Planner::new(txn).build(ast))?;
                let Plan::Select(mut root) = plan else {
                    return Err("can only use opt with SELECT plans".into());
                };
                writeln!(output, "{}", format!("Initial:\n{root}").replace('\n', "\n   "))?;
                for optimizer in OPTIMIZERS.iter() {
                    let prev = root.clone();
                    root = optimizer.optimize(root)?;
                    if root != prev {
                        writeln!(
                            output,
                            "{}",
                            format!("{optimizer:?}:\n{root}").replace('\n', "\n   ")
                        )?;
                    }
                }
            }

            // Execute the statement.
            let result = session.execute(input)?;

            // Output engine ops if requested.
            if tags.remove("ops") {
                while let Ok(op) = self.op_rx.try_recv() {
                    match op {
                        testengine::Operation::Delete { key } => {
                            let fmtkey = format::MVCC::<format::SQL>::key(&key);
                            let rawkey = format::Raw::key(&key);
                            writeln!(output, "delete {fmtkey} [{rawkey}]")?;
                        }
                        testengine::Operation::Flush => writeln!(output, "flush")?,
                        testengine::Operation::Set { key, value } => {
                            let fmtkv = format::MVCC::<format::SQL>::key_value(&key, &value);
                            let rawkv = format::Raw::key_value(&key, &value);
                            writeln!(output, "set {fmtkv} [{rawkv}]")?;
                        }
                    }
                }
            }

            // Output the result if requested. SELECT results are always output.
            match result {
                StatementResult::Select { columns, rows } => {
                    if tags.remove("header") {
                        writeln!(output, "{}", columns.into_iter().join(", "))?;
                    }
                    for row in rows {
                        writeln!(output, "{}", row.into_iter().join(", "))?;
                    }
                }
                result if tags.remove("result") => writeln!(output, "{result:?}")?,
                _ => {}
            }

            // Reject unknown tags.
            if let Some(tag) = tags.iter().next() {
                return Err(format!("unknown tag {tag}").into());
            }

            Ok(output)
        }

        /// Drain unprocessed operations after each command, so that they
        /// don't show up in the `ops` output of a later command.
        fn end_command(&mut self, _: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
            while self.op_rx.try_recv().is_ok() {}
            Ok(String::new())
        }
    }

    /// A test runner for expressions. Evaluates expressions to values, and
    /// optionally emits the expression tree. Stateless: expressions are
    /// evaluated without any input row.
    struct ExpressionRunner;

    /// The catalog type used to instantiate the planner when building
    /// expressions: a transaction of a local engine on in-memory storage.
    type Catalog<'a> = <Local<storage::Memory> as Engine<'a>>::Transaction;

    impl goldenscript::Runner for ExpressionRunner {
        /// Evaluates the command as an expression and outputs its value.
        ///
        /// The `cnf` tag also outputs the expression in conjunctive normal
        /// form, and asserts that it evaluates to the same value. The `expr`
        /// tag also outputs a debug dump of the expression. Unknown tags are
        /// rejected.
        fn run(&mut self, command: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
            // The entire command is the expression to evaluate. There are no args.
            if !command.args.is_empty() {
                return Err("expressions should be given as a command with no args".into());
            }
            let mut tags = command.tags.clone();

            // Parse, build, and evaluate the expression.
            let ast = Parser::parse_expr(&command.name)?;
            let expr = Planner::<Catalog>::build_expression(ast, &Scope::new())?;
            let value = expr.evaluate(None)?;

            let mut output = String::new();
            write!(output, "{value}")?;

            // If requested, convert the expression to conjunctive normal form
            // and dump it. Assert that it produces the same result.
            if tags.remove("cnf") {
                let cnf = expr.clone().into_cnf();
                assert_eq!(value, cnf.evaluate(None)?, "CNF result differs");
                write!(output, " ← {cnf}")?;
            }

            // If requested, debug-dump the parsed expression.
            if tags.remove("expr") {
                write!(output, " ← {expr:?}")?;
            }
            output.push('\n');

            // Any tag that is still left is unknown, and rejected.
            match tags.iter().next() {
                Some(tag) => Err(format!("unknown tag {tag}").into()),
                None => Ok(output),
            }
        }
    }
}


================================================
FILE: src/sql/parser/ast.rs
================================================
use std::collections::BTreeMap;
use std::hash::{Hash, Hasher};

use crate::sql::types::DataType;

/// SQL statements are represented as an Abstract Syntax Tree (AST). The
/// statement is the root node of this tree, and describes the syntactic
/// structure of a SQL statement. It is built from a raw SQL string by the
/// parser, and passed on to the planner which validates it and builds an
/// execution plan from it.
#[derive(Debug)]
pub enum Statement {
    /// BEGIN: begins a new transaction.
    Begin {
        /// READ ONLY: if true, begin a read-only transaction.
        read_only: bool,
        /// AS OF: if given, the MVCC version to read at.
        as_of: Option<u64>,
    },
    /// COMMIT: commits a transaction.
    Commit,
    /// ROLLBACK: rolls back a transaction.
    Rollback,
    /// EXPLAIN: explains a SQL statement's execution plan.
    Explain(Box<Statement>),
    /// CREATE TABLE: creates a new table.
    CreateTable {
        /// The table name.
        name: String,
        /// Column specifications.
        columns: Vec<Column>,
    },
    /// DROP TABLE: drops a table.
    DropTable {
        /// The table to drop.
        name: String,
        /// IF EXISTS: if true, don't error if the table doesn't exist.
        if_exists: bool,
    },
    /// DELETE: deletes rows from a table.
    Delete {
        /// The table to delete from.
        table: String,
        /// WHERE: optional condition to match rows to delete.
        r#where: Option<Expression>,
    },
    /// INSERT INTO: inserts new rows into a table.
    Insert {
        /// Table to insert into.
        table: String,
        /// Columns to insert values into. If None, all columns are used.
        columns: Option<Vec<String>>,
        /// Row values to insert.
        values: Vec<Vec<Expression>>,
    },
    /// UPDATE: updates rows in a table.
    Update {
        /// The table to update.
        table: String,
        /// SET: the columns to assign, as column → value. A None value means
        /// the column's default value.
        set: BTreeMap<String, Option<Expression>>,
        /// WHERE: optional condition to match rows to update.
        r#where: Option<Expression>,
    },
    /// SELECT: selects rows, possibly from a table.
    Select {
        /// Expressions to select, with an optional column alias.
        select: Vec<(Expression, Option<String>)>,
        /// FROM: tables to select from.
        from: Vec<From>,
        /// WHERE: optional condition to filter rows.
        r#where: Option<Expression>,
        /// GROUP BY: expressions to group and aggregate by.
        group_by: Vec<Expression>,
        /// HAVING: expression to filter groups by.
        having: Option<Expression>,
        /// ORDER BY: expressions to sort by, with direction.
        order_by: Vec<(Expression, Direction)>,
        /// OFFSET: row offset to start from.
        offset: Option<Expression>,
        /// LIMIT: maximum number of rows to return.
        limit: Option<Expression>,
    },
}

/// A FROM item: either a single table, or a join of two FROM items.
///
/// NB: this enum shadows the prelude's `From` trait within this module, which
/// is why conversions below are implemented via `core::convert::From`.
#[derive(Debug)]
pub enum From {
    /// A table.
    Table {
        /// The table name.
        name: String,
        /// An optional alias for the table.
        alias: Option<String>,
    },
    /// A join of two or more tables (may be nested).
    Join {
        /// The left table to join.
        left: Box<From>,
        /// The right table to join.
        right: Box<From>,
        /// The join type.
        r#type: JoinType,
        /// The join condition. None for a cross join.
        predicate: Option<Expression>,
    },
}

/// A CREATE TABLE column definition.
#[derive(Debug)]
pub struct Column {
    /// The column name.
    pub name: String,
    /// The column's data type.
    pub datatype: DataType,
    /// Whether the column is marked as the primary key.
    pub primary_key: bool,
    /// Explicit nullability, if any. NOTE(review): None presumably means the
    /// statement specified neither NULL nor NOT NULL, leaving the choice to
    /// the planner -- confirm against the parser and planner.
    pub nullable: Option<bool>,
    /// The default value expression, if any.
    pub default: Option<Expression>,
    /// Whether the column is marked as unique.
    pub unique: bool,
    /// Whether the column is marked as indexed.
    pub index: bool,
    /// The name given in a REFERENCES clause, if any. NOTE(review):
    /// presumably the referenced table -- confirm against the parser.
    pub references: Option<String>,
}

/// The kinds of JOIN that can combine two FROM items.
#[derive(Debug, PartialEq)]
pub enum JoinType {
    Cross,
    Inner,
    Left,
    Right,
}

impl JoinType {
    /// Returns true for outer joins (LEFT and RIGHT), where rows with no join
    /// matches are emitted with a NULL match. CROSS and INNER joins are not
    /// outer joins.
    pub fn is_outer(&self) -> bool {
        match self {
            Self::Cross | Self::Inner => false,
            Self::Left | Self::Right => true,
        }
    }
}

/// ORDER BY direction. Defaults to ascending.
#[derive(Debug, Default)]
pub enum Direction {
    /// Ascending order (the default).
    #[default]
    Ascending,
    /// Descending order.
    Descending,
}

/// SQL expressions, e.g. `a + 7 > b`. Can be nested.
///
/// Expressions compare and hash by value (see the Literal impls for how NULL
/// and floats are handled), which allows using them as e.g. hashmap keys.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum Expression {
    /// All columns, i.e. *.
    All,
    /// A column reference, optionally qualified with a table name.
    Column(Option<String>, String),
    /// A literal value.
    Literal(Literal),
    /// A function call (name and parameters).
    Function(String, Vec<Expression>),
    /// An operator.
    Operator(Operator),
}

/// Literal values that can appear in an expression.
#[derive(Clone, Debug)]
pub enum Literal {
    Null,
    Boolean(bool),
    Integer(i64),
    Float(f64),
    String(String),
}

/// Literals (and the expressions containing them) must be usable in e.g.
/// hashmaps, so equality is plain value identity for every variant, including
/// Null and f64::NAN. SQL semantics for NULL and NaN are deliberately ignored
/// here; those are handled by SQL expression evaluation.
impl PartialEq for Literal {
    fn eq(&self, other: &Self) -> bool {
        match self {
            Self::Null => matches!(other, Self::Null),
            Self::Boolean(lhs) => matches!(other, Self::Boolean(rhs) if lhs == rhs),
            Self::Integer(lhs) => matches!(other, Self::Integer(rhs) if lhs == rhs),
            // Floats are compared by bit pattern, so a NaN equals a NaN with
            // the same bits, while 0.0 and -0.0 are distinct.
            Self::Float(lhs) => {
                matches!(other, Self::Float(rhs) if lhs.to_bits() == rhs.to_bits())
            }
            Self::String(lhs) => matches!(other, Self::String(rhs) if lhs == rhs),
        }
    }
}

impl Eq for Literal {}

/// Hashes the variant tag followed by the value, consistent with PartialEq:
/// floats are hashed by bit pattern.
impl Hash for Literal {
    fn hash<H>(&self, state: &mut H)
    where
        H: Hasher,
    {
        let variant = core::mem::discriminant(self);
        variant.hash(state);
        match self {
            Self::Null => {}
            Self::Boolean(value) => value.hash(state),
            Self::Integer(value) => value.hash(state),
            Self::Float(value) => {
                let bits = value.to_bits();
                bits.hash(state)
            }
            Self::String(value) => value.hash(state),
        }
    }
}

/// Expression operators.
///
/// Since this is a recursive data structure, we have to box each child
/// expression, which incurs a heap allocation. There are clever ways to get
/// around this, but we keep it simple.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum Operator {
    // Logical operators.
    And(Box<Expression>, Box<Expression>), // a AND b
    Not(Box<Expression>),                  // NOT a
    Or(Box<Expression>, Box<Expression>),  // a OR b

    // Comparison operators.
    Equal(Box<Expression>, Box<Expression>),       // a = b
    GreaterThan(Box<Expression>, Box<Expression>), // a > b
    GreaterThanOrEqual(Box<Expression>, Box<Expression>), // a >= b
    Is(Box<Expression>, Literal),                  // IS NULL or IS NAN
    LessThan(Box<Expression>, Box<Expression>),    // a < b
    LessThanOrEqual(Box<Expression>, Box<Expression>), // a <= b
    NotEqual(Box<Expression>, Box<Expression>),    // a != b

    // Mathematical operators.
    Add(Box<Expression>, Box<Expression>),          // a + b
    Divide(Box<Expression>, Box<Expression>),       // a / b
    Exponentiate(Box<Expression>, Box<Expression>), // a ^ b
    Factorial(Box<Expression>),                     // a!
    Identity(Box<Expression>),                      // +a
    Multiply(Box<Expression>, Box<Expression>),     // a * b
    Negate(Box<Expression>),                        // -a
    Remainder(Box<Expression>, Box<Expression>),    // a % b
    Subtract(Box<Expression>, Box<Expression>),     // a - b

    // String pattern matching.
    Like(Box<Expression>, Box<Expression>), // a LIKE b
}

impl Expression {
    /// Walks the expression tree depth-first (parents before children),
    /// calling the visitor for every node. Halts as soon as the visitor
    /// returns false, in which case walk() returns false. Returns true if the
    /// visitor returned true for every node in the tree.
    pub fn walk(&self, visitor: &mut impl FnMut(&Expression) -> bool) -> bool {
        use Operator::*;

        if !visitor(self) {
            return false;
        }

        match self {
            Self::Operator(op) => match op {
                Add(lhs, rhs)
                | And(lhs, rhs)
                | Divide(lhs, rhs)
                | Equal(lhs, rhs)
                | Exponentiate(lhs, rhs)
                | GreaterThan(lhs, rhs)
                | GreaterThanOrEqual(lhs, rhs)
                | LessThan(lhs, rhs)
                | LessThanOrEqual(lhs, rhs)
                | Like(lhs, rhs)
                | Multiply(lhs, rhs)
                | NotEqual(lhs, rhs)
                | Or(lhs, rhs)
                | Remainder(lhs, rhs)
                | Subtract(lhs, rhs) => lhs.walk(visitor) && rhs.walk(visitor),

                Factorial(expr) | Identity(expr) | Is(expr, _) | Negate(expr) | Not(expr) => {
                    expr.walk(visitor)
                }
            },

            // Every argument must be walked, halting only when one of them
            // halts, hence all() rather than any(): any() would stop after the
            // first argument that completed, and would report a halt for a
            // function without arguments.
            Self::Function(_, exprs) => exprs.iter().all(|expr| expr.walk(visitor)),

            Self::All | Self::Column(_, _) | Self::Literal(_) => true,
        }
    }

    /// Returns true if the visitor returns true for any node in the expression
    /// tree. Walks the tree depth-first and stops at the first match. This is
    /// the inverse of walk().
    pub fn contains(&self, visitor: &impl Fn(&Expression) -> bool) -> bool {
        !self.walk(&mut |expr| !visitor(expr))
    }

    /// Finds expressions for which the visitor returns true, and appends
    /// clones of them to exprs in depth-first order. Does not recurse into
    /// matching expressions.
    pub fn collect(&self, visitor: &impl Fn(&Expression) -> bool, exprs: &mut Vec<Expression>) {
        use Operator::*;

        if visitor(self) {
            exprs.push(self.clone());
            return;
        }

        match self {
            Self::Operator(op) => match op {
                Add(lhs, rhs)
                | And(lhs, rhs)
                | Divide(lhs, rhs)
                | Equal(lhs, rhs)
                | Exponentiate(lhs, rhs)
                | GreaterThan(lhs, rhs)
                | GreaterThanOrEqual(lhs, rhs)
                | LessThan(lhs, rhs)
                | LessThanOrEqual(lhs, rhs)
                | Like(lhs, rhs)
                | Multiply(lhs, rhs)
                | NotEqual(lhs, rhs)
                | Or(lhs, rhs)
                | Remainder(lhs, rhs)
                | Subtract(lhs, rhs) => {
                    lhs.collect(visitor, exprs);
                    rhs.collect(visitor, exprs);
                }
                Factorial(expr) | Identity(expr) | Is(expr, _) | Negate(expr) | Not(expr) => {
                    expr.collect(visitor, exprs);
                }
            },

            Self::Function(_, args) => args.iter().for_each(|arg| arg.collect(visitor, exprs)),

            Self::All | Self::Column(_, _) | Self::Literal(_) => {}
        }
    }
}

impl core::convert::From<Literal> for Expression {
    fn from(literal: Literal) -> Self {
        Self::Literal(literal)
    }
}

impl core::convert::From<Operator> for Expression {
    fn from(op: Operator) -> Self {
        Self::Operator(op)
    }
}

impl core::convert::From<Operator> for Box<Expression> {
    fn from(value: Operator) -> Self {
        Box::new(value.into())
    }
}


================================================
FILE: src/sql/parser/lexer.rs
================================================
use std::fmt::Display;
use std::iter::Peekable;
use std::str::Chars;

use crate::errinput;
use crate::error::Result;

/// A lexical token.
///
/// These carry owned String clones rather than &str references into the
/// original input string, because the lexer may need to modify the string (e.g.
/// to parse escaped quotes in strings, lowercase identifiers, etc). We could
/// use `Cow<str>` to avoid this in the common case, but we'll end up using
/// owned strings in the final parsed AST anyway to avoid propagating these
/// lifetimes throughout the entire SQL execution engine, so we keep it simple.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// A numeric string, with digits, decimal points, and/or exponents. Leading
    /// signs (e.g. -) are separate tokens.
    Number(String),
    /// A Unicode string, with quotes stripped and escape sequences resolved.
    String(String),
    /// An identifier, with any quotes stripped. Lowercased if not quoted.
    Ident(String),
    /// A SQL keyword.
    Keyword(Keyword),
    // Symbols. Two-character symbols (e.g. !=) are emitted as a single token.
    Period,             // .
    Equal,              // =
    NotEqual,           // !=
    GreaterThan,        // >
    GreaterThanOrEqual, // >=
    LessThan,           // <
    LessThanOrEqual,    // <=
    LessOrGreaterThan,  // <>
    Plus,               // +
    Minus,              // -
    Asterisk,           // *
    Slash,              // /
    Caret,              // ^
    Percent,            // %
    Exclamation,        // !
    Question,           // ?
    Comma,              // ,
    Semicolon,          // ;
    OpenParen,          // (
    CloseParen,         // )
}

/// Displays the token's text: the contained string for numbers, strings, and
/// identifiers (without any quotes), the keyword's own display form, or the
/// symbol's characters.
impl Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let text = match self {
            // Keywords know how to display themselves.
            Self::Keyword(keyword) => return Display::fmt(keyword, f),
            Self::Number(text) | Self::String(text) | Self::Ident(text) => text.as_str(),
            Self::Period => ".",
            Self::Equal => "=",
            Self::NotEqual => "!=",
            Self::GreaterThan => ">",
            Self::GreaterThanOrEqual => ">=",
            Self::LessThan => "<",
            Self::LessThanOrEqual => "<=",
            Self::LessOrGreaterThan => "<>",
            Self::Plus => "+",
            Self::Minus => "-",
            Self::Asterisk => "*",
            Self::Slash => "/",
            Self::Caret => "^",
            Self::Percent => "%",
            Self::Exclamation => "!",
            Self::Question => "?",
            Self::Comma => ",",
            Self::Semicolon => ";",
            Self::OpenParen => "(",
            Self::CloseParen => ")",
        };
        f.write_str(text)
    }
}

impl From<Keyword> for Token {
    fn from(keyword: Keyword) -> Self {
        Self::Keyword(keyword)
    }
}

/// Reserved SQL keywords.
///
/// Keywords are recognized from lowercase strings via `TryFrom<&str>`, and are
/// displayed in uppercase. Unquoted identifiers that match a keyword are lexed
/// as keywords rather than identifiers.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Keyword {
    And,
    As,
    Asc,
    Begin,
    Bool,
    Boolean,
    By,
    Commit,
    Create,
    Cross,
    Default,
    Delete,
    Desc,
    Double,
    Drop,
    Exists,
    Explain,
    False,
    Float,
    From,
    Group,
    Having,
    If,
    Index,
    Infinity,
    Inner,
    Insert,
    Int,
    Integer,
    Into,
    Is,
    Join,
    Key,
    Left,
    Like,
    Limit,
    NaN,
    Not,
    Null,
    Of,
    Offset,
    On,
    Only,
    Or,
    Order,
    Outer,
    Primary,
    Read,
    References,
    Right,
    Rollback,
    Select,
    Set,
    String,
    System,
    Table,
    Text,
    Time,
    Transaction,
    True,
    Unique,
    Update,
    Values,
    Varchar,
    Where,
    Write,
}

/// Looks up the keyword for a lowercase string. Returns an error if the string
/// is not a keyword.
impl TryFrom<&str> for Keyword {
    // Use a cheap static error string. This just indicates it's not a keyword.
    type Error = &'static str;

    fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
        // Only compare lowercase, which is enforced by the lexer. This avoids
        // allocating a string to change the case. Assert this.
        //
        // Only ASCII uppercase is checked: all keywords are ASCII, and some
        // Unicode uppercase letters (e.g. U+1D400) have no lowercase mapping,
        // so a lowercased identifier may legitimately still contain characters
        // for which char::is_uppercase() is true. Asserting on those would
        // panic on valid user input in debug builds.
        debug_assert!(
            !value.bytes().any(|b| b.is_ascii_uppercase()),
            "keyword must be lowercase"
        );
        Ok(match value {
            "as" => Self::As,
            "asc" => Self::Asc,
            "and" => Self::And,
            "begin" => Self::Begin,
            "bool" => Self::Bool,
            "boolean" => Self::Boolean,
            "by" => Self::By,
            "commit" => Self::Commit,
            "create" => Self::Create,
            "cross" => Self::Cross,
            "default" => Self::Default,
            "delete" => Self::Delete,
            "desc" => Self::Desc,
            "double" => Self::Double,
            "drop" => Self::Drop,
            "exists" => Self::Exists,
            "explain" => Self::Explain,
            "false" => Self::False,
            "float" => Self::Float,
            "from" => Self::From,
            "group" => Self::Group,
            "having" => Self::Having,
            "if" => Self::If,
            "index" => Self::Index,
            "infinity" => Self::Infinity,
            "inner" => Self::Inner,
            "insert" => Self::Insert,
            "int" => Self::Int,
            "integer" => Self::Integer,
            "into" => Self::Into,
            "is" => Self::Is,
            "join" => Self::Join,
            "key" => Self::Key,
            "left" => Self::Left,
            "like" => Self::Like,
            "limit" => Self::Limit,
            "nan" => Self::NaN,
            "not" => Self::Not,
            "null" => Self::Null,
            "of" => Self::Of,
            "offset" => Self::Offset,
            "on" => Self::On,
            "only" => Self::Only,
            "or" => Self::Or,
            "order" => Self::Order,
            "outer" => Self::Outer,
            "primary" => Self::Primary,
            "read" => Self::Read,
            "references" => Self::References,
            "right" => Self::Right,
            "rollback" => Self::Rollback,
            "select" => Self::Select,
            "set" => Self::Set,
            "string" => Self::String,
            "system" => Self::System,
            "table" => Self::Table,
            "text" => Self::Text,
            "time" => Self::Time,
            "transaction" => Self::Transaction,
            "true" => Self::True,
            "unique" => Self::Unique,
            "update" => Self::Update,
            "values" => Self::Values,
            "varchar" => Self::Varchar,
            "where" => Self::Where,
            "write" => Self::Write,
            _ => return Err("not a keyword"),
        })
    }
}

/// Displays the keyword in its uppercase SQL form.
impl Display for Keyword {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let uppercase = match self {
            Self::And => "AND",
            Self::As => "AS",
            Self::Asc => "ASC",
            Self::Begin => "BEGIN",
            Self::Bool => "BOOL",
            Self::Boolean => "BOOLEAN",
            Self::By => "BY",
            Self::Commit => "COMMIT",
            Self::Create => "CREATE",
            Self::Cross => "CROSS",
            Self::Default => "DEFAULT",
            Self::Delete => "DELETE",
            Self::Desc => "DESC",
            Self::Double => "DOUBLE",
            Self::Drop => "DROP",
            Self::Exists => "EXISTS",
            Self::Explain => "EXPLAIN",
            Self::False => "FALSE",
            Self::Float => "FLOAT",
            Self::From => "FROM",
            Self::Group => "GROUP",
            Self::Having => "HAVING",
            Self::If => "IF",
            Self::Index => "INDEX",
            Self::Infinity => "INFINITY",
            Self::Inner => "INNER",
            Self::Insert => "INSERT",
            Self::Int => "INT",
            Self::Integer => "INTEGER",
            Self::Into => "INTO",
            Self::Is => "IS",
            Self::Join => "JOIN",
            Self::Key => "KEY",
            Self::Left => "LEFT",
            Self::Like => "LIKE",
            Self::Limit => "LIMIT",
            Self::NaN => "NAN",
            Self::Not => "NOT",
            Self::Null => "NULL",
            Self::Of => "OF",
            Self::Offset => "OFFSET",
            Self::On => "ON",
            Self::Only => "ONLY",
            Self::Or => "OR",
            Self::Order => "ORDER",
            Self::Outer => "OUTER",
            Self::Primary => "PRIMARY",
            Self::Read => "READ",
            Self::References => "REFERENCES",
            Self::Right => "RIGHT",
            Self::Rollback => "ROLLBACK",
            Self::Select => "SELECT",
            Self::Set => "SET",
            Self::String => "STRING",
            Self::System => "SYSTEM",
            Self::Table => "TABLE",
            Self::Text => "TEXT",
            Self::Time => "TIME",
            Self::Transaction => "TRANSACTION",
            Self::True => "TRUE",
            Self::Unique => "UNIQUE",
            Self::Update => "UPDATE",
            Self::Values => "VALUES",
            Self::Varchar => "VARCHAR",
            Self::Where => "WHERE",
            Self::Write => "WRITE",
        };
        f.write_str(uppercase)
    }
}

/// The lexer (lexical analyzer) preprocesses raw SQL strings into a sequence of
/// lexical tokens (e.g. keyword, number, string, etc), which are passed on to
/// the SQL parser. In doing so, it strips away basic syntactic noise such as
/// whitespace, case, and quotes, and performs initial symbol validation.
///
/// It is used as an iterator that yields tokens, or an error on invalid input.
pub struct Lexer<'a> {
    /// The remaining input characters, with a single character of lookahead.
    chars: Peekable<Chars<'a>>,
}

/// The lexer is consumed as an iterator of tokens. It yields an error for
/// input that can't be scanned, and None at the end of the input.
impl Iterator for Lexer<'_> {
    type Item = Result<Token>;

    fn next(&mut self) -> Option<Result<Token>> {
        // A scanned token or a scan error is emitted as is. If nothing was
        // scanned but there are remaining chars, the lexer didn't recognize
        // them, so error on the first one.
        self.scan()
            .transpose()
            .or_else(|| self.chars.peek().map(|c| errinput!("unexpected character {c}")))
    }
}

impl<'a> Lexer<'a> {
    /// Constructs a lexer over the characters of the given input string.
    pub fn new(input: &'a str) -> Lexer<'a> {
        let chars = input.chars().peekable();
        Lexer { chars }
    }

    /// Consumes and returns the next character if it satisfies the predicate.
    /// Otherwise returns None without consuming anything.
    fn next_if(&mut self, predicate: impl Fn(char) -> bool) -> Option<char> {
        match self.chars.peek() {
            Some(&c) if predicate(c) => self.chars.next(),
            _ => None,
        }
    }

    /// Passes the next character to the closure. If the closure returns Some,
    /// the character is consumed and the closure's value is returned. If there
    /// is no next character, or the closure returns None, nothing is consumed.
    fn next_if_map<T>(&mut self, map: impl Fn(char) -> Option<T>) -> Option<T> {
        let upcoming = *self.chars.peek()?;
        let mapped = map(upcoming)?;
        self.chars.next();
        Some(mapped)
    }

    /// Consumes the next character and returns true if it equals the given
    /// character. Otherwise returns false without consuming anything.
    fn next_is(&mut self, c: char) -> bool {
        let consumed = self.next_if(|next| next == c);
        consumed.is_some()
    }

    /// Scans the next token, skipping any leading whitespace. Returns None at
    /// the end of the input, or if the next character doesn't begin any known
    /// token (in which case it is left unconsumed).
    fn scan(&mut self) -> Result<Option<Token>> {
        self.skip_whitespace();
        // The first character determines the kind of token to scan.
        match self.chars.peek().copied() {
            None => Ok(None),
            Some('\'') => self.scan_string(),
            Some('"') => self.scan_ident_quoted(),
            Some('0'..='9') => Ok(self.scan_number()),
            Some(c) if c.is_alphabetic() => Ok(self.scan_ident_or_keyword()),
            Some(_) => Ok(self.scan_symbol()),
        }
    }

    /// Scans the next unquoted identifier or keyword, if any. The name is
    /// lowercased, by SQL convention, and emitted as a keyword token if it
    /// matches a keyword, otherwise as an identifier token.
    fn scan_ident_or_keyword(&mut self) -> Option<Token> {
        // The name must begin with an alphabetic character. Any following
        // characters may also be numeric or underscores.
        let first = self.next_if(|c| c.is_alphabetic())?;
        let mut name: String = first.to_lowercase().collect();
        while let Some(c) = self.next_if(|c| c == '_' || c.is_alphanumeric()) {
            name.extend(c.to_lowercase());
        }
        // Keywords take precedence over identifiers.
        let token = match Keyword::try_from(name.as_str()) {
            Ok(keyword) => Token::Keyword(keyword),
            Err(_) => Token::Ident(name),
        };
        Some(token)
    }

    /// Scans the next double-quoted identifier, if any. Case is preserved, and
    /// the surrounding quotes are stripped. Errors if the closing quote is
    /// missing.
    fn scan_ident_quoted(&mut self) -> Result<Option<Token>> {
        if !self.next_is('"') {
            return Ok(None);
        }
        let mut ident = String::new();
        loop {
            let Some(c) = self.chars.next() else {
                return errinput!("unexpected end of quoted identifier");
            };
            if c != '"' {
                ident.push(c);
                continue;
            }
            // A quote either ends the identifier, or is the first half of the
            // "" escape sequence for a literal ".
            if !self.next_is('"') {
                return Ok(Some(Token::Ident(ident)));
            }
            ident.push('"');
        }
    }

    /// Scans the next number, if any: one or more digits, optionally followed
    /// by a decimal point with fractional digits, and/or an exponent marker
    /// (e or E) with an optional sign and digits. The text is not validated
    /// further here, e.g. an exponent marker without digits is accepted.
    fn scan_number(&mut self) -> Option<Token> {
        let is_digit = |c: char| c.is_ascii_digit();

        // The integer part, which must have at least one digit.
        let mut number = String::from(self.next_if(is_digit)?);
        while let Some(digit) = self.next_if(is_digit) {
            number.push(digit);
        }
        // The optional fractional part.
        if self.next_is('.') {
            number.push('.');
            while let Some(digit) = self.next_if(is_digit) {
                number.push(digit);
            }
        }
        // The optional exponent, with an optional sign.
        if let Some(marker) = self.next_if(|c| matches!(c, 'e' | 'E')) {
            number.push(marker);
            number.extend(self.next_if(|c| matches!(c, '+' | '-')));
            while let Some(digit) = self.next_if(is_digit) {
                number.push(digit);
            }
        }
        Some(Token::Number(number))
    }

    /// Scans the next single-quoted string literal, if any. The surrounding
    /// quotes are stripped. Errors if the closing quote is missing.
    fn scan_string(&mut self) -> Result<Option<Token>> {
        if !self.next_is('\'') {
            return Ok(None);
        }
        let mut string = String::new();
        loop {
            let Some(c) = self.chars.next() else {
                return errinput!("unexpected end of string literal");
            };
            if c != '\'' {
                string.push(c);
                continue;
            }
            // A quote either ends the string, or is the first half of the ''
            // escape sequence for a literal '.
            if !self.next_is('\'') {
                return Ok(Some(Token::String(string)));
            }
            string.push('\'');
        }
    }

    /// Scans the next symbol token, if any. Returns None, without consuming
    /// anything, if the next character is not a known symbol.
    fn scan_symbol(&mut self) -> Option<Token> {
        // Every symbol begins with a character that is itself a symbol.
        let single = self.next_if_map(|c| match c {
            '.' => Some(Token::Period),
            '=' => Some(Token::Equal),
            '>' => Some(Token::GreaterThan),
            '<' => Some(Token::LessThan),
            '+' => Some(Token::Plus),
            '-' => Some(Token::Minus),
            '*' => Some(Token::Asterisk),
            '/' => Some(Token::Slash),
            '^' => Some(Token::Caret),
            '%' => Some(Token::Percent),
            '!' => Some(Token::Exclamation),
            '?' => Some(Token::Question),
            ',' => Some(Token::Comma),
            ';' => Some(Token::Semicolon),
            '(' => Some(Token::OpenParen),
            ')' => Some(Token::CloseParen),
            _ => None,
        })?;
        // Extend it into a two-character symbol (e.g. !=) when the following
        // character completes one.
        let symbol = match single {
            Token::Exclamation if self.next_is('=') => Token::NotEqual,
            Token::GreaterThan if self.next_is('=') => Token::GreaterThanOrEqual,
            Token::LessThan if self.next_is('>') => Token::LessOrGreaterThan,
            Token::LessThan if self.next_is('=') => Token::LessThanOrEqual,
            other => other,
        };
        Some(symbol)
    }

    /// Consumes whitespace characters up to the next non-whitespace character
    /// or the end of the input.
    fn skip_whitespace(&mut self) {
        loop {
            if self.next_if(|c| c.is_whitespace()).is_none() {
                break;
            }
        }
    }
}

/// Returns true if the entire given string lexes as exactly one identifier
/// token. Keywords, other kinds of tokens, lexer errors, and trailing tokens
/// all yield false.
pub fn is_ident(ident: &str) -> bool {
    let mut lexer = Lexer::new(ident);
    // The first token must be an identifier, and nothing may follow it.
    matches!(lexer.next(), Some(Ok(Token::Ident(_)))) && lexer.next().is_none()
}


================================================
FILE: src/sql/parser/mod.rs
================================================
//! Parses raw SQL strings into a structured Abstract Syntax Tree.

pub mod ast;
mod lexer;
mod parser;

pub use lexer::{Keyword, Lexer, Token, is_ident};
pub use parser::Parser;


================================================
FILE: src/sql/parser/parser.rs
================================================
use std::iter::Peekable;
use std::ops::Add;

use super::{Keyword, Lexer, Token, ast};
use crate::errinput;
use crate::error::Result;
use crate::sql::types::DataType;

/// The SQL parser takes tokens from the lexer and parses the SQL syntax into an
/// Abstract Syntax Tree (AST).
///
/// The AST represents the syntactic structure of a SQL query (e.g. the SELECT
/// and FROM clauses, values, arithmetic expressions, etc.). However, it only
/// ensures the syntax is well-formed, and does not know whether e.g. a given
/// table or column exists or which kind of join to use -- that is the job of
/// the planner.
pub struct Parser<'a> {
    /// The lexer that produces the tokens to parse, with a single token of
    /// lookahead.
    pub lexer: Peekable<Lexer<'a>>,
}

impl Parser<'_> {
    /// Parses a SQL string into a statement AST. The input must contain
    /// exactly one statement, optionally terminated by a semicolon; any
    /// tokens after that are an error.
    pub fn parse(statement: &str) -> Result<ast::Statement> {
        let mut parser = Self::new(statement);
        let parsed = parser.parse_statement()?;
        // A single trailing semicolon is tolerated.
        parser.skip(Token::Semicolon);
        // Anything left over means the input wasn't a single statement.
        match parser.lexer.next().transpose()? {
            None => Ok(parsed),
            Some(token) => errinput!("unexpected token {token}"),
        }
    }

    /// Parses a SQL string into an expression AST. The whole input must be
    /// consumed by a single expression. Only used in tests.
    #[cfg(test)]
    pub fn parse_expr(expr: &str) -> Result<ast::Expression> {
        let mut parser = Self::new(expr);
        let parsed = parser.parse_expression()?;
        // Trailing tokens mean the input wasn't a single expression.
        match parser.lexer.next().transpose()? {
            None => Ok(parsed),
            Some(token) => errinput!("unexpected token {token}"),
        }
    }

    /// Builds a parser over the given raw SQL string, lexing it lazily.
    fn new(input: &str) -> Parser<'_> {
        let lexer = Lexer::new(input).peekable();
        Parser { lexer }
    }

    /// Consumes and returns the next lexer token. Lexer errors are passed
    /// through, and running out of input is an error.
    fn next(&mut self) -> Result<Token> {
        match self.lexer.next() {
            Some(result) => result,
            None => errinput!("unexpected end of input"),
        }
    }

    /// Consumes the next token and returns its name if it is an identifier,
    /// otherwise errors.
    fn next_ident(&mut self) -> Result<String> {
        let token = self.next()?;
        let Token::Ident(ident) = token else {
            return errinput!("expected identifier, got {token}");
        };
        Ok(ident)
    }

    /// Consumes and returns the next lexer token, but only if it satisfies the
    /// predicate. Otherwise (including at end of input or on a lexer error)
    /// nothing is consumed and None is returned.
    fn next_if(&mut self, predicate: impl Fn(&Token) -> bool) -> Option<Token> {
        let accepted = self.peek().ok()?.is_some_and(|token| predicate(token));
        if !accepted {
            return None;
        }
        self.next().ok()
    }

    /// Applies the closure to the next lexer token without consuming it. If
    /// the closure yields Some, the token is consumed. Returns the closure's
    /// result, or None if there is no token to peek at.
    fn next_if_map<T>(&mut self, f: impl Fn(&Token) -> Option<T>) -> Option<T> {
        // Bail out on lexer errors and on end of input.
        let token = self.peek().ok()??;
        let mapped = f(token)?;
        // The closure accepted the token, so consume it.
        drop(self.next());
        Some(mapped)
    }

    /// Consumes and returns the next token's keyword, if it is a keyword.
    fn next_if_keyword(&mut self) -> Option<Keyword> {
        self.next_if_map(|token| {
            if let Token::Keyword(keyword) = token { Some(*keyword) } else { None }
        })
    }

    /// Returns true and consumes the next lexer token if it equals the given
    /// token. Otherwise returns false and consumes nothing.
    fn next_is(&mut self, token: Token) -> bool {
        matches!(self.next_if(|candidate| *candidate == token), Some(_))
    }

    /// Consumes the next lexer token if it's the expected token, or errors.
    fn expect(&mut self, expect: Token) -> Result<()> {
        let token = self.next()?;
        if token != expect {
            return errinput!("expected token {expect}, found {token}");
        }
        Ok(())
    }

    /// Consumes the next lexer token if it equals the given token, otherwise
    /// does nothing. Same as next_is() with the result ignored, but states
    /// the intent more clearly at call sites.
    fn skip(&mut self, token: Token) {
        let _consumed = self.next_is(token);
    }

    /// Looks at the next lexer token without consuming it. The lexer's
    /// Option<Result<_>> is transposed into Result<Option<_>> for convenience,
    /// cloning the error if the lexer failed.
    fn peek(&mut self) -> Result<Option<&Token>> {
        match self.lexer.peek() {
            Some(Ok(token)) => Ok(Some(token)),
            Some(Err(err)) => Err(err.clone()),
            None => Ok(None),
        }
    }

    /// Parses a SQL statement, dispatching on its leading keyword. The keyword
    /// is only peeked here; each statement parser consumes it itself.
    fn parse_statement(&mut self) -> Result<ast::Statement> {
        let token = match self.peek()? {
            Some(token) => token,
            None => return errinput!("unexpected end of input"),
        };
        // Every statement begins with a keyword.
        let Token::Keyword(keyword) = token else {
            return errinput!("unexpected token {token}");
        };
        match keyword {
            // Transaction control and introspection.
            Keyword::Begin => self.parse_begin(),
            Keyword::Commit => self.parse_commit(),
            Keyword::Rollback => self.parse_rollback(),
            Keyword::Explain => self.parse_explain(),

            // Schema changes.
            Keyword::Create => self.parse_create_table(),
            Keyword::Drop => self.parse_drop_table(),

            // Data queries and modifications.
            Keyword::Delete => self.parse_delete(),
            Keyword::Insert => self.parse_insert(),
            Keyword::Select => self.parse_select(),
            Keyword::Update => self.parse_update(),

            _ => errinput!("unexpected token {token}"),
        }
    }

    /// Parses a BEGIN statement:
    ///
    /// BEGIN [TRANSACTION] [READ ONLY | READ WRITE] [AS OF SYSTEM TIME number]
    fn parse_begin(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Begin.into())?;
        self.skip(Keyword::Transaction.into());

        // Optional READ ONLY / READ WRITE. Without it, the transaction is
        // not read-only.
        let read_only = if self.next_is(Keyword::Read.into()) {
            match self.next()? {
                Token::Keyword(Keyword::Only) => true,
                Token::Keyword(Keyword::Write) => false,
                token => return errinput!("unexpected token {token}"),
            }
        } else {
            false
        };

        // Optional AS OF SYSTEM TIME <number>.
        let as_of = if self.next_is(Keyword::As.into()) {
            self.expect(Keyword::Of.into())?;
            self.expect(Keyword::System.into())?;
            self.expect(Keyword::Time.into())?;
            match self.next()? {
                Token::Number(n) => Some(n.parse()?),
                token => return errinput!("unexpected token {token}, wanted number"),
            }
        } else {
            None
        };

        Ok(ast::Statement::Begin { read_only, as_of })
    }

    /// Parses a COMMIT statement, which consists of the keyword alone.
    fn parse_commit(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Commit.into()).map(|()| ast::Statement::Commit)
    }

    /// Parses a ROLLBACK statement, which consists of the keyword alone.
    fn parse_rollback(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Rollback.into()).map(|()| ast::Statement::Rollback)
    }

    /// Parses an EXPLAIN statement, which wraps any other statement. Directly
    /// nested EXPLAINs are rejected.
    fn parse_explain(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Explain.into())?;
        let nested = self.next_is(Keyword::Explain.into());
        if nested {
            return errinput!("cannot nest EXPLAIN statements");
        }
        let statement = self.parse_statement()?;
        Ok(ast::Statement::Explain(Box::new(statement)))
    }

    /// Parses a CREATE TABLE statement: the table name followed by a
    /// parenthesized, comma-separated list of at least one column definition.
    fn parse_create_table(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Create.into())?;
        self.expect(Keyword::Table.into())?;
        let name = self.next_ident()?;
        self.expect(Token::OpenParen)?;
        // The first column is mandatory, further ones follow commas.
        let mut columns = vec![self.parse_create_table_column()?];
        while self.next_is(Token::Comma) {
            columns.push(self.parse_create_table_column()?);
        }
        self.expect(Token::CloseParen)?;
        Ok(ast::Statement::CreateTable { name, columns })
    }

    /// Parses a CREATE TABLE column definition: the column name, its data
    /// type, and then any number of keyword-introduced column options.
    fn parse_create_table_column(&mut self) -> Result<ast::Column> {
        let name = self.next_ident()?;
        let datatype = match self.next()? {
            Token::Keyword(Keyword::Bool | Keyword::Boolean) => DataType::Boolean,
            Token::Keyword(Keyword::Float | Keyword::Double) => DataType::Float,
            Token::Keyword(Keyword::Int | Keyword::Integer) => DataType::Integer,
            Token::Keyword(Keyword::String | Keyword::Text | Keyword::Varchar) => DataType::String,
            token => return errinput!("unexpected token {token}"),
        };

        // Column options, collected while consuming keywords.
        let mut primary_key = false;
        let mut nullable: Option<bool> = None;
        let mut default = None;
        let mut unique = false;
        let mut index = false;
        let mut references = None;

        while let Some(keyword) = self.next_if_keyword() {
            match keyword {
                // PRIMARY KEY
                Keyword::Primary => {
                    self.expect(Keyword::Key.into())?;
                    primary_key = true;
                }
                // NULL, which may not be combined with an earlier NULL or
                // NOT NULL.
                Keyword::Null => {
                    if nullable.is_some() {
                        return errinput!("nullability already set for column {}", name);
                    }
                    nullable = Some(true);
                }
                // NOT NULL, with the same restriction.
                Keyword::Not => {
                    self.expect(Keyword::Null.into())?;
                    if nullable.is_some() {
                        return errinput!("nullability already set for column {}", name);
                    }
                    nullable = Some(false);
                }
                // DEFAULT expr
                Keyword::Default => default = Some(self.parse_expression()?),
                Keyword::Unique => unique = true,
                Keyword::Index => index = true,
                // REFERENCES table
                Keyword::References => references = Some(self.next_ident()?),
                keyword => return errinput!("unexpected keyword {keyword}"),
            }
        }

        Ok(ast::Column { name, datatype, primary_key, nullable, default, unique, index, references })
    }

    /// Parses a DROP TABLE statement, with an optional IF EXISTS before the
    /// table name.
    fn parse_drop_table(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Drop.into())?;
        self.expect(Keyword::Table.into())?;
        // IF must be followed by EXISTS.
        let if_exists = self.next_is(Keyword::If.into());
        if if_exists {
            self.expect(Keyword::Exists.into())?;
        }
        let name = self.next_ident()?;
        Ok(ast::Statement::DropTable { name, if_exists })
    }

    /// Parses a DELETE statement: DELETE FROM table, with an optional WHERE
    /// clause.
    fn parse_delete(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Delete.into())?;
        self.expect(Keyword::From.into())?;
        let table = self.next_ident()?;
        let r#where = self.parse_where_clause()?;
        Ok(ast::Statement::Delete { table, r#where })
    }

    /// Parses an INSERT statement: INSERT INTO table, an optional
    /// parenthesized column list, and VALUES followed by one or more
    /// comma-separated, parenthesized rows of expressions.
    fn parse_insert(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Insert.into())?;
        self.expect(Keyword::Into.into())?;
        let table = self.next_ident()?;

        // Optional column list. If given, it must name at least one column.
        let columns = if self.next_is(Token::OpenParen) {
            let mut names = vec![self.next_ident()?];
            while self.next_is(Token::Comma) {
                names.push(self.next_ident()?);
            }
            self.expect(Token::CloseParen)?;
            Some(names)
        } else {
            None
        };

        self.expect(Keyword::Values.into())?;

        // One or more rows, each with one or more expressions.
        let mut values = Vec::new();
        loop {
            self.expect(Token::OpenParen)?;
            let mut row = vec![self.parse_expression()?];
            while self.next_is(Token::Comma) {
                row.push(self.parse_expression()?);
            }
            self.expect(Token::CloseParen)?;
            values.push(row);
            if !self.next_is(Token::Comma) {
                break;
            }
        }

        Ok(ast::Statement::Insert { table, columns, values })
    }

    /// Parses an UPDATE statement: UPDATE table SET column = expr, ... with
    /// an optional WHERE clause. A column may be set to DEFAULT instead of an
    /// expression, which is recorded as None. Setting the same column twice
    /// is an error.
    fn parse_update(&mut self) -> Result<ast::Statement> {
        self.expect(Keyword::Update.into())?;
        let table = self.next_ident()?;
        self.expect(Keyword::Set.into())?;

        let mut set = std::collections::BTreeMap::new();
        let mut more = true;
        while more {
            let column = self.next_ident()?;
            self.expect(Token::Equal)?;
            let expr = if self.next_is(Keyword::Default.into()) {
                None
            } else {
                Some(self.parse_expression()?)
            };
            if set.contains_key(&column) {
                return errinput!("column {column} set multiple times");
            }
            set.insert(column, expr);
            more = self.next_is(Token::Comma);
        }

        let r#where = self.parse_where_clause()?;
        Ok(ast::Statement::Update { table, set, r#where })
    }

    /// Parses a SELECT statement by parsing each of its clauses in turn.
    fn parse_select(&mut self) -> Result<ast::Statement> {
        // The clauses must be parsed in the order they appear in the query.
        let select = self.parse_select_clause()?;
        let from = self.parse_from_clause()?;
        let r#where = self.parse_where_clause()?;
        let group_by = self.parse_group_by_clause()?;
        let having = self.parse_having_clause()?;
        let order_by = self.parse_order_by_clause()?;
        let limit = self.parse_limit_clause()?;
        let offset = self.parse_offset_clause()?;
        Ok(ast::Statement::Select {
            select,
            from,
            r#where,
            group_by,
            having,
            order_by,
            limit,
            offset,
        })
    }

    /// Parses a SELECT clause, if present, as a list of expressions with
    /// optional aliases. An alias is given either as `expr AS alias` or just
    /// `expr alias`. The * expression can't be aliased.
    fn parse_select_clause(&mut self) -> Result<Vec<(ast::Expression, Option<String>)>> {
        let mut select = Vec::new();
        if !self.next_is(Keyword::Select.into()) {
            return Ok(select);
        }
        let mut more = true;
        while more {
            let expr = self.parse_expression()?;
            // Either an explicit AS, or a bare identifier after the expression.
            let has_alias =
                self.next_is(Keyword::As.into()) || matches!(self.peek()?, Some(Token::Ident(_)));
            let alias = if has_alias {
                if expr == ast::Expression::All {
                    return errinput!("can't alias *");
                }
                Some(self.next_ident()?)
            } else {
                None
            };
            select.push((expr, alias));
            more = self.next_is(Token::Comma);
        }
        Ok(select)
    }

    /// Parses a FROM clause, if present, as a comma-separated list of items.
    /// Each item is a table followed by any number of joins, which nest to
    /// the left. All joins except CROSS JOIN require an ON predicate.
    fn parse_from_clause(&mut self) -> Result<Vec<ast::From>> {
        let mut from = Vec::new();
        if !self.next_is(Keyword::From.into()) {
            return Ok(from);
        }
        loop {
            let mut item = self.parse_from_table()?;
            // Fold each subsequent join into the item built so far.
            while let Some(r#type) = self.parse_from_join()? {
                let right = self.parse_from_table()?;
                let predicate = if r#type != ast::JoinType::Cross {
                    self.expect(Keyword::On.into())?;
                    Some(self.parse_expression()?)
                } else {
                    None
                };
                item = ast::From::Join {
                    left: Box::new(item),
                    right: Box::new(right),
                    r#type,
                    predicate,
                };
            }
            from.push(item);
            if !self.next_is(Token::Comma) {
                return Ok(from);
            }
        }
    }

    /// Parses a FROM table: a table name with an optional alias, given either
    /// as `table AS alias` or just `table alias`.
    fn parse_from_table(&mut self) -> Result<ast::From> {
        let name = self.next_ident()?;
        let has_alias =
            self.next_is(Keyword::As.into()) || matches!(self.peek()?, Some(Token::Ident(_)));
        let alias = has_alias.then(|| self.next_ident()).transpose()?;
        Ok(ast::From::Table { name, alias })
    }

    /// Parses a FROM JOIN type, if present. Accepts JOIN (an inner join),
    /// CROSS JOIN, INNER JOIN, LEFT [OUTER] JOIN, and RIGHT [OUTER] JOIN.
    /// Returns None without consuming anything if no join follows.
    fn parse_from_join(&mut self) -> Result<Option<ast::JoinType>> {
        let join_type = if self.next_is(Keyword::Join.into()) {
            // A bare JOIN is complete as-is.
            return Ok(Some(ast::JoinType::Inner));
        } else if self.next_is(Keyword::Cross.into()) {
            ast::JoinType::Cross
        } else if self.next_is(Keyword::Inner.into()) {
            ast::JoinType::Inner
        } else if self.next_is(Keyword::Left.into()) {
            self.skip(Keyword::Outer.into());
            ast::JoinType::Left
        } else if self.next_is(Keyword::Right.into()) {
            self.skip(Keyword::Outer.into());
            ast::JoinType::Right
        } else {
            return Ok(None);
        };
        // All qualified join types must be followed by JOIN.
        self.expect(Keyword::Join.into())?;
        Ok(Some(join_type))
    }

    /// Parses a WHERE clause, if present, returning its predicate expression.
    fn parse_where_clause(&mut self) -> Result<Option<ast::Expression>> {
        self.next_is(Keyword::Where.into()).then(|| self.parse_expression()).transpose()
    }

    /// Parses a GROUP BY clause, if present, as a comma-separated list of at
    /// least one expression. Returns an empty list if there is no clause.
    fn parse_group_by_clause(&mut self) -> Result<Vec<ast::Expression>> {
        if !self.next_is(Keyword::Group.into()) {
            return Ok(Vec::new());
        }
        self.expect(Keyword::By.into())?;
        let mut group_by = vec![self.parse_expression()?];
        while self.next_is(Token::Comma) {
            group_by.push(self.parse_expression()?);
        }
        Ok(group_by)
    }

    /// Parses a HAVING clause, if present, returning its predicate expression.
    fn parse_having_clause(&mut self) -> Result<Option<ast::Expression>> {
        Ok(if self.next_is(Keyword::Having.into()) {
            Some(self.parse_expression()?)
        } else {
            None
        })
    }

    /// Parses an ORDER BY clause, if present, as a comma-separated list of
    /// expressions, each optionally followed by ASC or DESC. Without either,
    /// the default direction is used.
    fn parse_order_by_clause(&mut self) -> Result<Vec<(ast::Expression, ast::Direction)>> {
        let mut order_by = Vec::new();
        if !self.next_is(Keyword::Order.into()) {
            return Ok(order_by);
        }
        self.expect(Keyword::By.into())?;
        let mut more = true;
        while more {
            let expr = self.parse_expression()?;
            let direction = if self.next_is(Keyword::Asc.into()) {
                ast::Direction::Ascending
            } else if self.next_is(Keyword::Desc.into()) {
                ast::Direction::Descending
            } else {
                ast::Direction::default()
            };
            order_by.push((expr, direction));
            more = self.next_is(Token::Comma);
        }
        Ok(order_by)
    }

    /// Parses a LIMIT clause, if present, returning its expression.
    fn parse_limit_clause(&mut self) -> Result<Option<ast::Expression>> {
        match self.next_is(Keyword::Limit.into()) {
            true => self.parse_expression().map(Some),
            false => Ok(None),
        }
    }

    /// Parses an OFFSET clause, if present, returning its expression.
    fn parse_offset_clause(&mut self) -> Result<Option<ast::Expression>> {
        if self.next_is(Keyword::Offset.into()) {
            return self.parse_expression().map(Some);
        }
        Ok(None)
    }

    /// Parses an expression using the precedence climbing algorithm. See:
    ///
    /// <https://en.wikipedia.org/wiki/Operator-precedence_parser#Precedence_climbing_method>
    /// <https://eli.thegreenplace.net/2012/08/02/parsing-expressions-by-precedence-climbing>
    ///
    /// Expressions are made up of two main entities:
    ///
    /// * Atoms: values, variables, functions, and parenthesized expressions.
    /// * Operators: performs operations on atoms and sub-expressions.
    ///   * Prefix operators: e.g. `-a` or `NOT a`.
    ///   * Infix operators: e.g. `a + b`  or `a AND b`.
    ///   * Postfix operators: e.g. `a!` or `a IS NULL`.
    ///
    /// During parsing, we have to respect the mathematical precedence and
    /// associativity of operators. Consider e.g.:
    ///
    /// 2 ^ 3 ^ 2 - 4 * 3
    ///
    /// By the rules of precedence and associativity, this expression should
    /// be interpreted as:
    ///
    /// (2 ^ (3 ^ 2)) - (4 * 3)
    ///
    /// Specifically, the exponentiation operator ^ is right-associative, so it
    /// should be 2 ^ (3 ^ 2) = 512, not (2 ^ 3) ^ 2 = 64. Similarly,
    /// exponentiation and multiplication have higher precedence than
    /// subtraction, so it should be (2 ^ 3 ^ 2) - (4 * 3) = 500, not
    /// 2 ^ 3 ^ (2 - 4) * 3 ≈ 3.24.
    ///
    /// To use precedence climbing, we first need to specify the relative
    /// precedence of operators as a number, where 1 is the lowest precedence:
    ///
    /// * 1: OR
    /// * 2: AND
    /// * 3: NOT
    /// * 4: =, !=, LIKE, IS
    /// * 5: <, <=, >, >=
    /// * 6: +, -
    /// * 7: *, /, %
    /// * 8: ^
    /// * 9: !
    /// * 10: +, - (prefix)
    ///
    /// We also have to specify the associativity of operators:
    ///
    /// * Right-associative: ^ and all prefix operators.
    /// * Left-associative: all other operators.
    ///
    /// When parsing the right-hand side of a left-associative operator, the
    /// minimum precedence is the operator's precedence +1, so that a following
    /// operator of the same precedence is not absorbed into the right-hand
    /// side but instead applied to the combined expression. Right-associative
    /// operators parse their right-hand side at their own precedence.
    ///
    /// The precedence climbing algorithm works by recursively parsing the
    /// left-hand side of an expression (including any prefix operators), any
    /// infix operators and recursive right-hand side expressions, and finally
    /// any postfix operators.
    ///
    /// The grouping is determined by where the right-hand side recursion
    /// terminates. The algorithm will greedily consume as many operators as
    /// possible, but only as long as their precedence is greater than or equal
    /// to the current minimum precedence (hence the name "climbing"). When we
    /// find an operator with lower precedence, we return the current
    /// expression up the recursion stack and resume parsing the operator at a
    /// lower precedence.
    ///
    /// The precedence levels for the previous example are as follows:
    ///
    ///     -----          Precedence 8: ^ right-associativity
    /// ---------          Precedence 8: ^
    ///             -----  Precedence 7: *
    /// -----------------  Precedence 6: -
    /// 2 ^ 3 ^ 2 - 4 * 3
    ///
    /// Let's walk through the recursive parsing of this expression:
    ///
    /// parse_expression_at(prec=0)
    ///   lhs = parse_expression_atom() = 2
    ///   op = parse_infix_operator_at(prec=0) = ^ (prec=8)
    ///   rhs = parse_expression_at(prec=8)  (right-associative: 8+0)
    ///     lhs = parse_expression_atom() = 3
    ///     op = parse_infix_operator_at(prec=8) = ^ (prec=8)
    ///     rhs = parse_expression_at(prec=8)  (right-associative: 8+0)
    ///       lhs = parse_expression_atom() = 2
    ///       op = parse_infix_operator_at(prec=8) = None (reject - at prec=6)
    ///       return lhs = 2
    ///     lhs = (lhs op rhs) = (3 ^ 2)
    ///     op = parse_infix_operator_at(prec=8) = None (reject - at prec=6)
    ///     return lhs = (3 ^ 2)
    ///   lhs = (lhs op rhs) = (2 ^ (3 ^ 2))
    ///   op = parse_infix_operator_at(prec=0) = - (prec=6)
    ///   rhs = parse_expression_at(prec=7)  (left-associative: 6+1)
    ///     lhs = parse_expression_atom() = 4
    ///     op = parse_infix_operator_at(prec=7) = * (prec=7)
    ///     rhs = parse_expression_at(prec=8)  (left-associative: 7+1)
    ///       lhs = parse_expression_atom() = 3
    ///       op = parse_infix_operator_at(prec=8) = None (end of expression)
    ///       return lhs = 3
    ///     lhs = (lhs op rhs) = (4 * 3)
    ///     op = parse_infix_operator_at(prec=7) = None (end of expression)
    ///     return lhs = (4 * 3)
    ///   lhs = (lhs op rhs) = ((2 ^ (3 ^ 2)) - (4 * 3))
    ///   op = parse_infix_operator_at(prec=0) = None (end of expression)
    ///   return lhs = ((2 ^ (3 ^ 2)) - (4 * 3))
    fn parse_expression(&mut self) -> Result<ast::Expression> {
        self.parse_expression_at(0)
    }

    /// Parses an expression at the given minimum precedence.
    ///
    /// Operators whose precedence is below `min_precedence` are not consumed:
    /// they are left in the token stream, and the expression parsed so far is
    /// returned to the caller. Errors if an operand can't be parsed.
    fn parse_expression_at(&mut self, min_precedence: Precedence) -> Result<ast::Expression> {
        // If the left-hand side is a prefix operator, recursively parse it and
        // its operand. Otherwise, parse the left-hand side as an atom.
        let mut lhs = if let Some(prefix) = self.parse_prefix_operator_at(min_precedence) {
            // The operand is parsed at the prefix operator's own precedence,
            // adjusted for its associativity.
            let next_precedence = prefix.precedence() + prefix.associativity();
            let rhs = self.parse_expression_at(next_precedence)?;
            prefix.into_expression(rhs)
        } else {
            self.parse_expression_atom()?
        };

        // Apply any postfix operators to the left-hand side.
        while let Some(postfix) = self.parse_postfix_operator_at(min_precedence)? {
            lhs = postfix.into_expression(lhs)
        }

        // Repeatedly apply any infix operators to the left-hand side as long as
        // their precedence is greater than or equal to the current minimum
        // precedence (i.e. that of the upstack operator).
        //
        // The right-hand side expression parsing will recursively apply any
        // infix operators at or above this operator's precedence to the
        // right-hand side.
        while let Some(infix) = self.parse_infix_operator_at(min_precedence) {
            // Left-associative operators add 1 here, so an operator of equal
            // precedence ends the right-hand side and is handled by this loop.
            let next_precedence = infix.precedence() + infix.associativity();
            let rhs = self.parse_expression_at(next_precedence)?;
            lhs = infix.into_expression(lhs, rhs);
        }

        // Apply any postfix operators after the binary operator. Consider e.g.
        // 1 + NULL IS NULL.
        while let Some(postfix) = self.parse_postfix_operator_at(min_precedence)? {
            lhs = postfix.into_expression(lhs)
        }

        Ok(lhs)
    }

    /// Parses an expression atom. This is either:
    ///
    /// * A literal value.
    /// * A column name.
    /// * A function call.
    /// * A parenthesized expression.
    fn parse_expression_atom(&mut self) -> Result<ast::Expression> {
        Ok(match self.next()? {
            // All columns.
            Token::Asterisk => ast::Expression::All,

            // Literal value.
            Token::Number(n) if n.chars().all(|c| c.is_ascii_digit()) => {
                ast::Literal::Integer(n.parse()?).into()
            }
            Token::Number(n) => ast::Literal::Float(n.parse()?).into(),
            Token::String(s) => ast::Literal::String(s).into(),
            Token::Keyword(Keyword::True) => ast::Literal::Boolean(true).into(),
            Token::Keyword(Keyword::False) => ast::Literal::Boolean(false).into(),
            Token::Keyword(Keyword::Infinity) => ast::Literal::Float(f64::INFINITY).into(),
            Token::Keyword(Keyword::NaN) => ast::Literal::Float(f64::NAN).into(),
            Token::Keyword(Keyword::Null) => ast::Literal::Null.into(),

            // Function call.
            Token::Ident(name) if self.next_is(Token::OpenParen) => {
                let mut args = Vec::new();
                while !self.next_is(Token::CloseParen) {
                    if !args.is_empty() {
                        self.expect(Token::Comma)?;
                    }
                    args.push(self.parse_expression()?);
                }
                ast::Expression::Function(name, args)
            }

            // Column name, either qualified as table.column or unqualified.
            Token::Ident(table) if self.next_is(Token::Period) => {
                ast::Expression::Column(Some(table), self.next_ident()?)
            }
            Token::Ident(column) => ast::Expression::Column(None, column),

            // Parenthesized expression.
            Token::OpenParen => {
                let expr = self.parse_expression()?;
                self.expect(Token::CloseParen)?;
                expr
            }

            token => return errinput!("expected expression atom, found {token}"),
        })
    }

    /// Parses a prefix operator, if the next token is one and its precedence
    /// is at least min_precedence. Otherwise consumes nothing.
    fn parse_prefix_operator_at(&mut self, min_precedence: Precedence) -> Option<PrefixOperator> {
        self.next_if_map(|token| {
            let operator = match token {
                Token::Keyword(Keyword::Not) => PrefixOperator::Not,
                Token::Minus => PrefixOperator::Minus,
                Token::Plus => PrefixOperator::Plus,
                _ => return None,
            };
            (operator.precedence() >= min_precedence).then_some(operator)
        })
    }

    /// Parses an infix operator, if the next token is one and its precedence
    /// is at least min_precedence. Otherwise consumes nothing.
    fn parse_infix_operator_at(&mut self, min_precedence: Precedence) -> Option<InfixOperator> {
        self.next_if_map(|token| {
            let operator = match token {
                // Logical operators.
                Token::Keyword(Keyword::And) => InfixOperator::And,
                Token::Keyword(Keyword::Or) => InfixOperator::Or,

                // Comparison operators. Both != and <> mean not equal.
                Token::Equal => InfixOperator::Equal,
                Token::NotEqual => InfixOperator::NotEqual,
                Token::LessOrGreaterThan => InfixOperator::NotEqual,
                Token::GreaterThan => InfixOperator::GreaterThan,
                Token::GreaterThanOrEqual => InfixOperator::GreaterThanOrEqual,
                Token::LessThan => InfixOperator::LessThan,
                Token::LessThanOrEqual => InfixOperator::LessThanOrEqual,
                Token::Keyword(Keyword::Like) => InfixOperator::Like,

                // Arithmetic operators.
                Token::Plus => InfixOperator::Add,
                Token::Minus => InfixOperator::Subtract,
                Token::Asterisk => InfixOperator::Multiply,
                Token::Slash => InfixOperator::Divide,
                Token::Percent => InfixOperator::Remainder,
                Token::Caret => InfixOperator::Exponentiate,

                _ => return None,
            };
            (operator.precedence() >= min_precedence).then_some(operator)
        })
    }

    /// Parses a postfix operator, if the next token starts one and its
    /// precedence is at least min_precedence. Otherwise consumes nothing.
    fn parse_postfix_operator_at(
        &mut self,
        min_precedence: Precedence,
    ) -> Result<Option<PostfixOperator>> {
        // Single-token postfix operators.
        let is_keyword = matches!(self.peek()?, Some(Token::Keyword(Keyword::Is)));
        if !is_keyword {
            return Ok(self.next_if_map(|token| {
                let operator = match token {
                    Token::Exclamation => PostfixOperator::Factorial,
                    _ => return None,
                };
                (operator.precedence() >= min_precedence).then_some(operator)
            }));
        }

        // IS [NOT] NULL/NAN spans multiple tokens, so it is handled here. We
        // can't consume any tokens unless the precedence is satisfied, so the
        // check uses IS NULL (all the variants have the same precedence).
        if PostfixOperator::Is(ast::Literal::Null).precedence() < min_precedence {
            return Ok(None);
        }
        self.expect(Keyword::Is.into())?;
        let negated = self.next_is(Keyword::Not.into());
        let value = match self.next()? {
            Token::Keyword(Keyword::NaN) => ast::Literal::Float(f64::NAN),
            Token::Keyword(Keyword::Null) => ast::Literal::Null,
            token => return errinput!("unexpected token {token}"),
        };
        Ok(Some(if negated {
            PostfixOperator::IsNot(value)
        } else {
            PostfixOperator::Is(value)
        }))
    }
}

/// Operator precedence, where a higher number binds tighter.
type Precedence = u8;

/// Operator associativity, i.e. which side an operator groups towards when
/// chained with operators of the same precedence.
enum Associativity {
    Left,
    Right,
}

impl Add<Associativity> for Precedence {
    type Output = Self;

    /// Adjusts a precedence for the given associativity: left-associative
    /// operators get +1, right-associative operators are left unchanged.
    fn add(self, rhs: Associativity) -> Self {
        // Left-associative operators have increased precedence, so they bind
        // tighter to their left-hand side.
        let bump: Precedence = match rhs {
            Associativity::Left => 1,
            Associativity::Right => 0,
        };
        self + bump
    }
}

/// Prefix operators, i.e. unary operators placed before their operand.
enum PrefixOperator {
    Minus, // -a
    Not,   // NOT a
    Plus,  // +a
}

impl PrefixOperator {
    /// Returns the operator's precedence.
    fn precedence(&self) -> Precedence {
        match self {
            Self::Minus | Self::Plus => 10,
            Self::Not => 3,
        }
    }

    /// Returns the operator's associativity. Prefix operators are
    /// right-associative by definition.
    fn associativity(&self) -> Associativity {
        Associativity::Right
    }

    /// Builds an AST expression applying the operator to the given operand.
    fn into_expression(self, rhs: ast::Expression) -> ast::Expression {
        let operand = Box::new(rhs);
        let operator = match self {
            Self::Minus => ast::Operator::Negate(operand),
            Self::Not => ast::Operator::Not(operand),
            Self::Plus => ast::Operator::Identity(operand),
        };
        operator.into()
    }
}

/// Infix operators, i.e. binary operators placed between a left-hand and a
/// right-hand operand.
enum InfixOperator {
    Add,                // a + b
    And,                // a AND b
    Divide,             // a / b
    Equal,              // a = b
    Exponentiate,       // a ^ b
    GreaterThan,        // a > b
    GreaterThanOrEqual, // a >= b
    LessThan,           // a < b
    LessThanOrEqual,    // a <= b
    Like,               // a LIKE b
    Multiply,           // a * b
    NotEqual,           // a != b
    Or,                 // a OR b
    Remainder,          // a % b
    Subtract,           // a - b
}

impl InfixOperator {
    /// Returns the precedence of the operator.
    ///
    /// Mostly follows Postgres, except IS and LIKE having same precedence as =.
    /// This is similar to SQLite and MySQL.
    fn precedence(&self) -> Precedence {
        match self {
            Self::Exponentiate => 8,
            Self::Multiply | Self::Divide | Self::Remainder => 7,
            Self::Add | Self::Subtract => 6,
            Self::GreaterThan
            | Self::GreaterThanOrEqual
            | Self::LessThan
            | Self::LessThanOrEqual => 5,
            Self::Equal | Self::NotEqual | Self::Like => 4, // also Self::Is
            // Self::Not => 3
            Self::And => 2,
            Self::Or => 1,
        }
    }

    /// Returns the associativity of the operator. Only exponentiation is
    /// right-associative, all other infix operators are left-associative.
    fn associativity(&self) -> Associativity {
        if matches!(self, Self::Exponentiate) {
            Associativity::Right
        } else {
            Associativity::Left
        }
    }

    /// Consumes the operator and combines the given operands into the
    /// corresponding AST operator expression.
    fn into_expression(self, lhs: ast::Expression, rhs: ast::Expression) -> ast::Expression {
        let lhs = Box::new(lhs);
        let rhs = Box::new(rhs);
        let operator: ast::Operator = match self {
            Self::Add => ast::Operator::Add(lhs, rhs),
            Self::And => ast::Operator::And(lhs, rhs),
            Self::Divide => ast::Operator::Divide(lhs, rhs),
            Self::Equal => ast::Operator::Equal(lhs, rhs),
            Self::Exponentiate => ast::Operator::Exponentiate(lhs, rhs),
            Self::GreaterThan => ast::Operator::GreaterThan(lhs, rhs),
            Self::GreaterThanOrEqual => ast::Operator::GreaterThanOrEqual(lhs, rhs),
            Self::LessThan => ast::Operator::LessThan(lhs, rhs),
            Self::LessThanOrEqual => ast::Operator::LessThanOrEqual(lhs, rhs),
            Self::Like => ast::Operator::Like(lhs, rhs),
            Self::Multiply => ast::Operator::Multiply(lhs, rhs),
            Self::NotEqual => ast::Operator::NotEqual(lhs, rhs),
            Self::Or => ast::Operator::Or(lhs, rhs),
            Self::Remainder => ast::Operator::Remainder(lhs, rhs),
            Self::Subtract => ast::Operator::Subtract(lhs, rhs),
        };
        operator.into()
    }
}

/// Postfix operators, which follow their single operand (e.g. `a!`). The IS
/// variants carry the literal that the operand is compared with.
enum PostfixOperator {
    Factorial,           // a!
    Is(ast::Literal),    // a IS NULL | NAN
    IsNot(ast::Literal), // a IS NOT NULL | NAN
}

impl PostfixOperator {
    /// Returns the precedence of the operator. IS and IS NOT have the same
    /// precedence as the infix equality operators.
    fn precedence(&self) -> Precedence {
        match self {
            Self::Factorial => 9,
            Self::Is(_) | Self::IsNot(_) => 4,
        }
    }

    /// Consumes the operator and wraps the given operand in the corresponding
    /// AST operator expression. IS NOT is expressed as a negated IS.
    fn into_expression(self, lhs: ast::Expression) -> ast::Expression {
        let operand = Box::new(lhs);
        let operator: ast::Operator = match self {
            Self::Factorial => ast::Operator::Factorial(operand),
            Self::Is(literal) => ast::Operator::Is(operand, literal),
            Self::IsNot(literal) => {
                let is = ast::Operator::Is(operand, literal);
                ast::Operator::Not(is.into())
            }
        };
        operator.into()
    }
}


================================================
FILE: src/sql/planner/mod.rs
================================================
//! The planner builds and optimizes an execution plan based on a SQL
//! statement's Abstract Syntax Tree (AST) generated by the parser.

mod optimizer;
mod plan;
mod planner;

#[cfg(test)]
pub use optimizer::OPTIMIZERS;
pub use plan::{Aggregate, Direction, Node, Plan};
pub use planner::{Planner, Scope};


================================================
FILE: src/sql/planner/optimizer.rs
================================================
use std::collections::HashMap;
use std::fmt::Debug;
use std::sync::LazyLock;

use super::Node;
use crate::error::Result;
use crate::sql::types::{Expression, Label, Value};

/// The set of optimizers, and the order in which they are applied.
///
/// The order matters: IndexLookup assumes that FilterPushdown has already
/// pushed filters into scan nodes, and ShortCircuit assumes that
/// ConstantFolding has already folded constant expressions.
pub static OPTIMIZERS: LazyLock<Vec<Box<dyn Optimizer>>> = LazyLock::new(|| {
    vec![
        Box::new(ConstantFolding),
        Box::new(FilterPushdown),
        Box::new(IndexLookup),
        Box::new(HashJoin),
        Box::new(ShortCircuit),
    ]
});

/// A node optimizer, which recursively transforms a plan node to make plan
/// execution more efficient where possible.
///
/// Optimizers are stored as boxed trait objects in the shared OPTIMIZERS
/// static, which is why they must be Send + Sync.
pub trait Optimizer: Debug + Send + Sync {
    /// Optimizes a node, returning the optimized node. The node is consumed,
    /// and the returned node takes its place in the plan.
    fn optimize(&self, node: Node) -> Result<Node>;
}

/// Folds constant expressions by evaluating them once during planning, such
/// that they don't have to be re-evaluated for every row during execution.
#[derive(Debug)]
pub struct ConstantFolding;

impl Optimizer for ConstantFolding {
    fn optimize(&self, node: Node) -> Result<Node> {
        // Fold the expressions of every node in the tree. The expressions
        // themselves are folded post-order, such that child expressions are
        // partially folded as far as possible first. This also avoids
        // quadratic costs.
        let fold_expressions = |node: Node| node.transform_expressions(&Ok, &Self::fold);
        node.transform(&fold_expressions, &Ok)
    }
}

impl ConstantFolding {
    /// Folds constant expressions in a node.
    pub fn fold(mut expr: Expression) -> Result<Expression> {
        use Expression::*;
        use Value::*;

        // If the expression is constant, evaluate it.
        //
        // This is a very simple approach, which doesn't handle more complex
        // cases such as 1 + a - 2 (which would require rearranging the
        // expression as 1 - 2 + a to evaluate the 1 - 2 branch).
        //
        // TODO: consider doing something better.
        if !expr.contains(&|expr| matches!(expr, Column(_))) {
            return expr.evaluate(None).map(Constant);
        }

        // If the expression is a logical operator, and one of the sides is
        // constant, we may be able to evaluate it even if it has a column
        // reference. For example, a AND FALSE is always FALSE, regardless of
        // what a is.
        expr = match expr {
            And(lhs, rhs) => match (*lhs, *rhs) {
                // If either side of an AND is false, the AND is false.
                (Constant(Boolean(false)), _) | (_, Constant(Boolean(false))) => {
                    Constant(Boolean(false))
                }
                // If either side of an AND is true, the AND is redundant.
                (Constant(Boolean(true)), expr) | (expr, Constant(Boolean(true))) => expr,
                (lhs, rhs) => And(lhs.into(), rhs.into()),
            },

            Or(lhs, rhs) => match (*lhs, *rhs) {
                // If either side of an OR is true, the OR is true.
                (Constant(Boolean(true)), _) | (_, Constant(Boolean(true))) => {
                    Constant(Boolean(true))
                }
                // If either side of an OR is false, the OR is redundant.
                (Constant(Boolean(false)), expr) | (expr, Constant(Boolean(false))) => expr,
                (lhs, rhs) => Or(lhs.into(), rhs.into()),
            },

            expr => expr,
        };

        Ok(expr)
    }
}

/// Pushes filter predicates down into child nodes where possible. Most
/// importantly, pushing a predicate from a Filter node down into a Scan node
/// allows filtering during storage scans (below Raft), rather than reading
/// and transmitting all rows across the network before filtering them.
#[derive(Debug)]
pub struct FilterPushdown;

impl Optimizer for FilterPushdown {
    fn optimize(&self, node: Node) -> Result<Node> {
        // Push down before descending into the children, so that predicates
        // can keep being pushed further down the tree.
        let push = |node: Node| -> Result<Node> { Ok(Self::push_filters(node)) };
        node.transform(&push, &Ok)
    }
}

impl FilterPushdown {
    /// Pushes filter predicates down into child nodes where possible.
    fn push_filters(mut node: Node) -> Node {
        node = Self::maybe_push_filter(node);
        node = Self::maybe_push_join(node);
        node
    }

    /// Pushes an expression into a node if possible. Otherwise, returns the
    /// unpushed expression.
    ///
    /// Expressions are never pushed into the predicate of an outer join. The
    /// join predicate only decides whether a right row matches a left row,
    /// and left rows without a match are emitted anyway. An expression that
    /// filters the join output (e.g. a WHERE clause) must be able to discard
    /// those rows too, so merging it into the join predicate would change
    /// the result.
    fn push_into(expr: Expression, target: &mut Node) -> Option<Expression> {
        match target {
            Node::Filter { predicate, .. } => {
                // Temporarily replace the predicate to take ownership.
                let rhs = std::mem::replace(predicate, Expression::Constant(Value::Null));
                *predicate = Expression::And(expr.into(), rhs.into());
            }
            Node::NestedLoopJoin { predicate, outer: false, .. } => {
                *predicate = match predicate.take() {
                    Some(predicate) => Some(Expression::And(expr.into(), predicate.into())),
                    None => Some(expr),
                };
            }
            Node::Scan { filter, .. } => {
                *filter = match filter.take() {
                    Some(filter) => Some(Expression::And(expr.into(), filter.into())),
                    None => Some(expr),
                };
            }
            // Unable to push down, just return the original expression.
            _ => return Some(expr),
        }
        None
    }

    /// Pushes a filter node predicate down into its source, if possible.
    fn maybe_push_filter(node: Node) -> Node {
        let Node::Filter { mut source, predicate } = node else {
            return node;
        };
        // Attempt to push the filter into the source, or return the original.
        //
        // TODO: a filter above an outer join is kept as is. The parts of it
        // that only reference the left source could be pushed into the left
        // source.
        if let Some(predicate) = Self::push_into(predicate, &mut source) {
            return Node::Filter { source, predicate };
        }
        // Push succeeded, return the source that was pushed into. When we
        // replace this filter node with the source node, Node.transform() will
        // skip the source node since it now takes the place of the original
        // filter node. Transform the source manually.
        Self::push_filters(*source)
    }

    /// Pushes down parts of a join predicate into the left or right sources
    /// where possible.
    ///
    /// For outer joins, predicates are only ever pushed into the right source.
    /// Filtering the right source merely removes rows that couldn't match
    /// anyway, but filtering the left source would remove left rows that must
    /// be emitted with NULLs for the right row.
    fn maybe_push_join(node: Node) -> Node {
        let Node::NestedLoopJoin { mut left, mut right, predicate: Some(predicate), outer } = node
        else {
            return node;
        };
        // Convert the predicate into conjunctive normal form (an AND vector).
        let cnf = predicate.into_cnf_vec();

        // Push down expressions that don't reference both sources. Constant
        // expressions can be pushed down into both for inner joins, but only
        // into the right source for outer joins.
        let (mut push_left, mut push_right, mut predicate) = (Vec::new(), Vec::new(), Vec::new());
        for expr in cnf {
            let (mut ref_left, mut ref_right) = (false, false);
            expr.walk(&mut |expr| {
                if let Expression::Column(index) = expr {
                    ref_left = ref_left || *index < left.columns();
                    ref_right = ref_right || *index >= left.columns();
                }
                !(ref_left && ref_right) // exit once both are referenced
            });
            match (ref_left, ref_right) {
                (true, true) => predicate.push(expr),
                // Outer joins must retain left-only expressions in the join
                // predicate, to still emit the non-matching left rows.
                (true, false) if outer => predicate.push(expr),
                (true, false) => push_left.push(expr),
                (false, true) => push_right.push(expr),
                (false, false) if outer => push_right.push(expr),
                (false, false) => {
                    push_left.push(expr.clone());
                    push_right.push(expr);
                }
            }
        }

        // In the remaining cross-source expressions, look for equijoins where
        // one side also has constant value lookups. In this case we can copy
        // the constant lookups to the other side, to allow index lookups. This
        // commonly happens when joining a foreign key (which is indexed) on a
        // primary key, and we want to make use of the foreign key index, e.g.:
        //
        // SELECT m.name, g.name FROM movies m JOIN genres g ON m.genre_id = g.id AND g.id = 7;
        let left_lookups: HashMap<usize, usize> = push_left // column → push_left index
            .iter()
            .enumerate()
            .filter_map(|(i, expr)| expr.is_column_lookup().map(|column| (column, i)))
            .collect();
        let right_lookups: HashMap<usize, usize> = push_right // column → push_right index
            .iter()
            .enumerate()
            .filter_map(|(i, expr)| expr.is_column_lookup().map(|column| (column, i)))
            .collect();

        for expr in &predicate {
            // Find equijoins.
            let Expression::Equal(lhs, rhs) = expr else { continue };
            let Expression::Column(mut l) = **lhs else { continue };
            let Expression::Column(mut r) = **rhs else { continue };

            // The lhs may be a reference to the right source; swap them.
            if l > r {
                (l, r) = (r, l)
            }

            // Check if either side is a column lookup, and copy it over.
            // Lookups are never copied to the left source of an outer join,
            // since that would filter the left rows.
            if let Some(expr) = left_lookups.get(&l).map(|i| push_left[*i].clone()) {
                push_right.push(expr.replace_column(l, r));
            }
            if !outer
                && let Some(expr) = right_lookups.get(&r).map(|i| push_right[*i].clone())
            {
                push_left.push(expr.replace_column(r, l));
            }
        }

        // Push predicates down into the sources if possible.
        if let Some(expr) = Expression::and_vec(push_left)
            && let Some(expr) = Self::push_into(expr, &mut left)
        {
            // Pushdown failed, put it back into the join predicate.
            predicate.push(expr)
        }

        if let Some(mut expr) = Expression::and_vec(push_right) {
            // Right columns have indexes in the joined row; shift them left.
            expr = expr.shift_column(-(left.columns() as isize));
            if let Some(mut expr) = Self::push_into(expr, &mut right) {
                // Pushdown failed, undo the column index shift.
                expr = expr.shift_column(left.columns() as isize);
                predicate.push(expr)
            }
        }

        // Leave any remaining predicates in the join node.
        let predicate = Expression::and_vec(predicate);
        Node::NestedLoopJoin { left, right, predicate, outer }
    }
}

/// Uses a primary key or secondary index lookup where possible.
#[derive(Debug)]
pub struct IndexLookup;

impl Optimizer for IndexLookup {
    fn optimize(&self, node: Node) -> Result<Node> {
        // Rewrite every filtered scan node in the tree into a key or index
        // lookup where possible.
        node.transform(&|node| Ok(Self::index_lookup(node)), &Ok)
    }
}

impl IndexLookup {
    /// Rewrites a filtered scan node into a primary key or secondary index
    /// lookup if possible. Any other node is returned unchanged.
    fn index_lookup(node: Node) -> Node {
        // Only scan filters are handled. FilterPushdown is assumed to have
        // pushed filters into the scan nodes already.
        let Node::Scan { table, alias, filter: Some(filter) } = node else {
            return node;
        };

        // Convert the filter into conjunctive normal form (a list of ANDs).
        let mut cnf = filter.clone().into_cnf_vec();

        // Use the first expression that's a lookup on either the primary key
        // or an indexed column. We could be more clever here, but this is fine.
        let found = cnf.iter().enumerate().find_map(|(i, expr)| {
            let column = expr.is_column_lookup()?;
            (column == table.primary_key || table.columns[column].index).then_some((i, column))
        });
        let Some((i, column)) = found else {
            // No usable lookups, so retain the original scan.
            return Node::Scan { table, alias, filter: Some(filter) };
        };

        // Take the lookup expression out of the CNF vector, and extract the
        // values to look up from it.
        let values = cnf.remove(i).into_column_values(column);

        // Build the primary key or secondary index lookup node.
        let lookup = if column == table.primary_key {
            Node::KeyLookup { table, keys: values, alias }
        } else {
            Node::IndexLookup { table, column, values, alias }
        };

        // Any remaining CNF expressions must still be applied as a filter.
        match Expression::and_vec(cnf) {
            Some(predicate) => Node::Filter { source: Box::new(lookup), predicate },
            None => lookup,
        }
    }
}

/// Replaces nested loop joins with hash joins for single-column equijoins.
#[derive(Debug)]
pub struct HashJoin;

impl Optimizer for HashJoin {
    fn optimize(&self, node: Node) -> Result<Node> {
        // Rewrite every eligible nested loop join in the tree.
        let rewrite = |node: Node| -> Result<Node> { Ok(Self::hash_join(node)) };
        node.transform(&rewrite, &Ok)
    }
}

impl HashJoin {
    /// Rewrites a nested loop join into a hash join if possible.
    ///
    /// This requires the join predicate to be a single equality comparison
    /// between a column of the left source and a column of the right source.
    /// Any other node is returned unchanged.
    pub fn hash_join(node: Node) -> Node {
        let Node::NestedLoopJoin {
            left,
            right,
            predicate: Some(Expression::Equal(lhs, rhs)),
            outer,
        } = node
        else {
            return node;
        };

        match (*lhs, *rhs) {
            // If this is a single-column equijoin across the two sources, use
            // a hash join. Exactly one of the columns must belong to the left
            // source: if both columns are in the same source (which can happen
            // if the predicate hasn't been pushed down by FilterPushdown) then
            // there is nothing to hash join on, and adjusting the column index
            // below would underflow or yield an invalid column.
            (Expression::Column(a), Expression::Column(b))
                if (a < left.columns()) != (b < left.columns()) =>
            {
                // The LHS column may be a column in the right table; order
                // them such that the left source's column comes first.
                let (left_column, joined_column) = if a < b { (a, b) } else { (b, a) };
                // The NestedLoopJoin predicate uses column indexes in the
                // joined row, while the HashJoin uses column indexes in each
                // individual table. Adjust the RHS column reference.
                let right_column = joined_column - left.columns();
                Node::HashJoin { left, left_column, right, right_column, outer }
            }
            // Otherwise, retain the nested loop join.
            (lhs, rhs) => {
                let predicate = Some(Expression::Equal(lhs.into(), rhs.into()));
                Node::NestedLoopJoin { left, right, predicate, outer }
            }
        }
    }
}

/// Short-circuits useless nodes and expressions (for example a Filter node
/// that always evaluates to false), either by removing them or by replacing
/// them with Nothing nodes that yield no rows.
#[derive(Debug)]
pub struct ShortCircuit;

impl Optimizer for ShortCircuit {
    fn optimize(&self, node: Node) -> Result<Node> {
        // Transform post-order, such that Nothing nodes are pulled upwards in
        // the tree.
        let short_circuit = |node: Node| -> Result<Node> { Ok(Self::short_circuit(node)) };
        node.transform(&Ok, &short_circuit)
    }
}

impl ShortCircuit {
    /// Short-circuits useless nodes. Assumes the node has already been
    /// optimized by ConstantFolding.
    ///
    /// First removes nodes and predicates that have no effect, then replaces
    /// the node with a Nothing node if it can't produce any rows. Outer joins
    /// are only short-circuited when their left source is empty, since they
    /// emit every left row even when no right row matches.
    fn short_circuit(mut node: Node) -> Node {
        use Expression::*;
        use Value::*;

        node = match node {
            // Filter nodes that always yield true are unnecessary: remove them.
            Node::Filter { source, predicate: Constant(Boolean(true)) } => *source,

            // Predicates that always yield true are unnecessary: remove them.
            Node::Scan { table, filter: Some(Constant(Boolean(true))), alias } => {
                Node::Scan { table, filter: None, alias }
            }
            Node::NestedLoopJoin {
                left,
                right,
                predicate: Some(Constant(Boolean(true))),
                outer,
            } => Node::NestedLoopJoin { left, right, predicate: None, outer },

            // Remove noop projections that simply pass through the source columns.
            Node::Projection { source, expressions, aliases }
                if source.columns() == expressions.len()
                    && aliases.iter().all(|alias| *alias == Label::None)
                    && expressions
                        .iter()
                        .enumerate()
                        .all(|(i, expr)| *expr == Expression::Column(i)) =>
            {
                *source
            }

            node => node,
        };

        // Short-circuit nodes that don't produce anything by replacing them
        // with a Nothing node.
        let is_empty = match &node {
            Node::Filter { predicate: Constant(Boolean(false) | Null), .. } => true,
            Node::IndexLookup { values, .. } if values.is_empty() => true,
            Node::KeyLookup { keys, .. } if keys.is_empty() => true,
            Node::Limit { limit: 0, .. } => true,
            // An outer join that never matches still emits all left rows.
            Node::NestedLoopJoin {
                predicate: Some(Constant(Boolean(false) | Null)),
                outer: false,
                ..
            } => true,
            Node::Scan { filter: Some(Constant(Boolean(false) | Null)), .. } => true,
            Node::Values { rows } if rows.is_empty() => true,

            // Nodes that pull from a Nothing node can't produce anything.
            //
            // NB: does not short-circuit aggregation, since an aggregation over 0
            // rows should produce a result. Also does not short-circuit outer
            // joins with an empty right source, since these still emit all
            // left rows.
            Node::Filter { source, .. }
            | Node::HashJoin { left: source, .. }
            | Node::HashJoin { right: source, outer: false, .. }
            | Node::NestedLoopJoin { left: source, .. }
            | Node::NestedLoopJoin { right: source, outer: false, .. }
            | Node::Offset { source, .. }
            | Node::Order { source, .. }
            | Node::Projection { source, .. }
                if matches!(**source, Node::Nothing { .. }) =>
            {
                true
            }

            _ => false,
        };

        if is_empty {
            // Retain the column labels of the replaced node.
            let columns = (0..node.columns()).map(|i| node.column_label(i)).collect();
            return Node::Nothing { columns };
        }

        node
    }
}


================================================
FILE: src/sql/planner/plan.rs
================================================
use std::collections::HashMap;
use std::fmt::Display;

use itertools::Itertools as _;
use serde::{Deserialize, Serialize};

use super::optimizer::OPTIMIZERS;
use super::planner::Planner;
use crate::error::Result;
use crate::sql::engine::{Catalog, Transaction};
use crate::sql::execution::{ExecutionResult, Executor};
use crate::sql::parser::ast;
use crate::sql::types::{Expression, Label, Table, Value};

/// A statement execution plan.
///
/// The plan root specifies the action to take (e.g. SELECT, INSERT, UPDATE,
/// etc). It has a nested tree of child nodes that stream and process rows.
///
/// Below is an example of an (unoptimized) query plan:
///
/// SELECT title, released, genres.name AS genre
/// FROM movies INNER JOIN genres ON movies.genre_id = genres.id
/// WHERE released >= 2000
/// ORDER BY released DESC
///
/// Select
/// └─ Order: movies.released desc
///    └─ Projection: movies.title, movies.released, genres.name as genre
///       └─ Filter: movies.released >= 2000
///          └─ NestedLoopJoin: inner on movies.genre_id = genres.id
///             ├─ Scan: movies
///             └─ Scan: genres
///
/// Rows flow from the tree leaves to the root:
///
/// 1. Scan nodes read rows from movies and genres.
/// 2. NestedLoopJoin joins the rows from movies and genres.
/// 3. Filter discards rows with release dates older than 2000.
/// 4. Projection picks out the requested column values from the rows.
/// 5. Order sorts the rows by release date.
/// 6. Select returns the final rows to the client.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Plan {
    /// A CREATE TABLE plan. Creates a new table with the given schema. Errors
    /// if the table already exists or the schema is invalid.
    CreateTable { schema: Table },

    /// A DROP TABLE plan. Drops the given table. Errors if the table does not
    /// exist, unless if_exists is true.
    DropTable { name: String, if_exists: bool },

    /// A DELETE plan. Deletes rows in table that match the rows from source.
    /// primary_key specifies the primary key column index in the source rows.
    Delete { table: String, primary_key: usize, source: Node },

    /// An INSERT plan. Inserts rows from source (typically a Values node) into
    /// table. If column_map is given, it maps table → source column indexes and
    /// must have one entry for every column in source. Table columns not
    /// present in source will get the column's default value if set, or error.
    Insert { table: Table, column_map: Option<HashMap<usize, usize>>, source: Node },

    /// An UPDATE plan. Updates rows in table that match the rows from source,
    /// where primary_key specifies the primary key column index in the source
    /// rows. The given column/expression pairs specify the row updates to make,
    /// evaluated using the existing source row, which must be a complete row
    /// from the update table.
    Update { table: Table, primary_key: usize, source: Node, expressions: Vec<(usize, Expression)> },

    /// A SELECT plan. Recursively executes the query plan tree and returns the
    /// resulting rows.
    Select(Node),
}

impl Plan {
    /// Builds a plan from an AST statement.
    pub fn build(statement: ast::Statement, catalog: &impl Catalog) -> Result<Self> {
        Planner::new(catalog).build(statement)
    }

    /// Executes the plan, consuming it.
    pub fn execute(self, txn: &impl Transaction) -> Result<ExecutionResult> {
        Executor::new(txn).execute(self)
    }

    /// Optimizes the plan, consuming it. See OPTIMIZERS for the list of
    /// optimizers.
    pub fn optimize(self) -> Result<Self> {
        let optimize = |node| OPTIMIZERS.iter().try_fold(node, |node, opt| opt.optimize(node));
        Ok(match self {
            Self::CreateTable { .. } | Self::DropTable { .. } => self,
            Self::Delete { table, primary_key, source } => {
                Self::Delete { table, primary_key, source: optimize(source)? }
            }
            Self::Insert { table, column_map, source } => {
                Self::Insert { table, column_map, source: optimize(source)? }
            }
            Self::Update { table, primary_key, source, expressions } => {
                Self::Update { table, primary_key, source: optimize(source)?, expressions }
            }
            Self::Select(root) => Self::Select(optimize(root)?),
        })
    }
}

/// A query plan node. Returns a row iterator, and can be nested.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Node {
    /// Aggregates values for the given group_by buckets, across all rows in the
    /// source node. The group_by columns are emitted first, followed by the
    /// aggregate columns, in the given order.
    Aggregate { source: Box<Node>, group_by: Vec<Expression>, aggregates: Vec<Aggregate> },

    /// Filters source rows, by discarding rows for which the predicate
    /// evaluates to false. A constant NULL predicate is treated like a
    /// constant false predicate by the ShortCircuit optimizer, i.e. as
    /// matching no rows.
    Filter { source: Box<Node>, predicate: Expression },

    /// Joins the left and right sources on the given columns by building an
    /// in-memory hashmap of the right source and looking up matches for each
    /// row in the left source. When outer is true (e.g. LEFT JOIN), a left row
    /// without a right match is emitted anyway, with NULLs for the right row.
    /// The column indexes are relative to each individual source, not to the
    /// joined row.
    HashJoin {
        left: Box<Node>,
        left_column: usize,
        right: Box<Node>,
        right_column: usize,
        outer: bool,
    },

    /// Looks up the given values in a secondary index and emits matching rows.
    /// NULL and NaN values are considered equal, to allow IS NULL and IS NAN
    /// index lookups, as are -0.0 and 0.0.
    IndexLookup { table: Table, column: usize, values: Vec<Value>, alias: Option<String> },

    /// Looks up the given primary keys and emits their rows.
    KeyLookup { table: Table, keys: Vec<Value>, alias: Option<String> },

    /// Only emits the first limit rows from the source, discards the rest.
    Limit { source: Box<Node>, limit: usize },

    /// Joins the left and right sources on the given predicate by buffering the
    /// right source and iterating over it for every row in the left source.
    /// When outer is true (e.g. LEFT JOIN), a left row without a right match is
    /// emitted anyway, with NULLs for the right row. The predicate's column
    /// indexes are relative to the joined row, i.e. the left columns followed
    /// by the right columns.
    NestedLoopJoin { left: Box<Node>, right: Box<Node>, predicate: Option<Expression>, outer: bool },

    /// Nothing does not emit anything, and is used to short-circuit nodes that
    /// can't emit anything during optimization. It retains the column names of
    /// any replaced nodes for results headers and plan formatting.
    Nothing { columns: Vec<Label> },

    /// Discards the first offset rows from source, emits the rest.
    Offset { source: Box<Node>, offset: usize },

    /// Sorts the source rows by the given sort key. Buffers the entire row set
    /// in memory.
    Order { source: Box<Node>, key: Vec<(Expression, Direction)> },

    /// Projects the input rows by evaluating the given expressions. Aliases are
    /// only used when displaying the plan.
    Projection { source: Box<Node>, expressions: Vec<Expression>, aliases: Vec<Label> },

    /// Remaps source columns to the given target column index, or None to drop
    /// the column. Unspecified target columns yield Value::Null. The source →
    /// target mapping ensures a source column can only be mapped to a single
    /// target column, allowing the value to be moved rather than cloned.
    Remap { source: Box<Node>, targets: Vec<Option<usize>> },

    /// A full table scan, with an optional pushed-down filter. The schema is
    /// used during plan optimization. The alias is only used for formatting.
    Scan { table: Table, filter: Option<Expression>, alias: Option<String> },

    /// A constant set of values.
    Values { rows: Vec<Vec<Expression>> },
}

impl Node {
    /// Returns the width of the rows emitted by this node, i.e. the number of
    /// output columns.
    pub fn columns(&self) -> usize {
        match self {
            // Pass-through nodes emit rows with the same width as their source.
            Self::Filter { source, .. }
            | Self::Limit { source, .. }
            | Self::Offset { source, .. }
            | Self::Order { source, .. } => source.columns(),

            // Table sources emit every column in the table schema.
            Self::IndexLookup { table, .. }
            | Self::KeyLookup { table, .. }
            | Self::Scan { table, .. } => table.columns.len(),

            // Joins emit the columns of both sides combined.
            Self::HashJoin { left, right, .. } | Self::NestedLoopJoin { left, right, .. } => {
                left.columns() + right.columns()
            }

            // These nodes determine their own output width.
            Self::Aggregate { aggregates, group_by, .. } => aggregates.len() + group_by.len(),
            Self::Projection { expressions, .. } => expressions.len(),
            // A remap is as wide as its highest target index requires, or
            // empty if no column is targeted at all.
            Self::Remap { targets, .. } => match targets.iter().flatten().max() {
                Some(highest) => highest + 1,
                None => 0,
            },

            // Constant nodes carry their width with them. Values uses the
            // width of its first row, or 0 if it has no rows.
            Self::Nothing { columns } => columns.len(),
            Self::Values { rows } => rows.first().map_or(0, Vec::len),
        }
    }

    /// Returns the label of the output column at the given index, or
    /// Label::None if it has none, by following the column down through the
    /// plan tree. This is only meant for result headers and plan display;
    /// expression columns are looked up via Scope instead.
    pub fn column_label(&self, index: usize) -> Label {
        match self {
            // Table sources qualify the column name with the alias if there
            // is one, otherwise with the table name.
            Self::IndexLookup { table, alias, .. }
            | Self::KeyLookup { table, alias, .. }
            | Self::Scan { table, alias, .. } => {
                let qualifier = alias.clone().unwrap_or_else(|| table.name.clone());
                Label::Qualified(qualifier, table.columns[index].name.clone())
            }

            // Pass-through nodes defer to their source.
            Self::Filter { source, .. }
            | Self::Limit { source, .. }
            | Self::Offset { source, .. }
            | Self::Order { source, .. } => source.column_label(index),

            // Joins pick the side that the index falls into.
            Self::HashJoin { left, right, .. } | Self::NestedLoopJoin { left, right, .. } => {
                match index.checked_sub(left.columns()) {
                    Some(right_index) => right.column_label(right_index),
                    None => left.column_label(index),
                }
            }

            // Only GROUP BY entries that are plain column references can be
            // traced to a source column. Everything else is unlabeled.
            Self::Aggregate { source, group_by, .. } => {
                if let Some(Expression::Column(column)) = group_by.get(index) {
                    source.column_label(*column)
                } else {
                    Label::None
                }
            }

            // An explicit alias wins. Without one, a plain column reference
            // inherits the source column's label, and any other expression
            // has no label.
            Self::Projection { source, expressions, aliases } => {
                match (aliases.get(index), expressions.get(index)) {
                    (Some(Label::None) | None, Some(Expression::Column(column))) => {
                        source.column_label(*column)
                    }
                    (Some(Label::None) | None, _) => Label::None,
                    (Some(alias), _) => alias.clone(),
                }
            }

            // Find the first source column that is mapped to this target.
            Self::Remap { source, targets } => {
                match targets.iter().position(|target| *target == Some(index)) {
                    Some(from) => source.column_label(from),
                    None => Label::None,
                }
            }

            // Nothing nodes remember the labels of the nodes they replaced.
            Self::Nothing { columns } => match columns.get(index) {
                Some(label) => label.clone(),
                None => Label::None,
            },

            // Constant values are never labeled.
            Self::Values { .. } => Label::None,
        }
    }

    /// Recursively transforms query nodes depth-first by applying the given
    /// closures before and after descending.
    pub fn transform(
        mut self,
        before: &impl Fn(Self) -> Result<Self>,
        after: &impl Fn(Self) -> Result<Self>,
    ) -> Result<Self> {
        // Helper for transforming boxed nodes.
        let xform = |mut node: Box<Node>| -> Result<Box<Node>> {
            *node = node.transform(before, after)?;
            Ok(node)
        };

        self = before(self)?;
        self = match self {
            Self::Aggregate { source, group_by, aggregates } => {
                Self::Aggregate { source: xform(source)?, group_by, aggregates }
            }
            Self::Filter { source, predicate } => {
                Self::Filter { source: xform(source)?, predicate }
            }
            Self::HashJoin { left, left_column, right, right_column, outer } => Self::HashJoin {
                left: xform(left)?,
                left_column,
                right: xform(right)?,
                right_column,
                outer,
            },
            Self::Limit { source, limit } => Self::Limit { source: xform(source)?, limit },
            Self::NestedLoopJoin { left, right, predicate, outer } => {
                Self::NestedLoopJoin { left: xform(left)?, right: xform(right)?, predicate, outer }
            }
            Self::Offset { source, offset } => Self::Offset { source: xform(source)?, offset },
            Self::Order { source, key } => Self::Order { source: xform(source)?, key },
            Self::Projection { source, expressions, aliases } => {
                Self::Projection { source: xform(source)?, expressions, aliases }
            }
            Self::Remap { source, targets } => Self::Remap { source: xform(source)?, targets },

            Self::IndexLookup { .. }
            | Self::KeyLookup { .. }
            | Self::Nothing { .. }
            | Self::Scan { .. }
            | Self::Values { .. } => self,
        };
        self = after(self)?;
        Ok(self)
    }

    /// Recursively transforms all node expressions by calling the given
    /// closures on them before and after descending.
    pub fn transform_expressions(
        self,
        before: &impl Fn(Expression) -> Result<Expression>,
        after: &impl Fn(Expression) -> Result<Expression>,
    ) -> Result<Self> {
        Ok(match self {
            Self::Filter { source, mut predicate } => {
                predicate = predicate.transform(before, after)?;
                Self::Filter { source, predicate }
            }
            Self::NestedLoopJoin { left, right, predicate: Some(predicate), outer } => {
                let predicate = Some(predicate.transform(before, after)?);
                Self::NestedLoopJoin { left, right, predicate, outer }
            }
            Self::Order { source, mut key } => {
                key = key
                    .into_iter()
                    .map(|(expr, dir)| expr.transform(before, after).map(|expr| (expr, dir)))
                    .try_collect()?;
                Self::Order { source, key }
            }
            Self::Projection { source, mut expressions, aliases } => {
                expressions = expressions
                    .into_iter()
                    .map(|expr| expr.transform(before, after))
                    .try_collect()?;
                Self::Projection { source, expressions, aliases }
            }
            Self::Scan { table, alias, filter: Some(filter) } => {
                let filter = Some(filter.transform(before, after)?);
                Self::Scan { table, alias, filter }
            }
            Self::Values { mut rows } => {
                rows = rows
                    .into_iter()
                    .map(|row| row.into_iter().map(|expr| expr.transform(before, after)).collect())
                    .try_collect()?;
                Self::Values { rows }
            }

            Self::Aggregate { .. }
            | Self::HashJoin { .. }
            | Self::IndexLookup { .. }
            | Self::KeyLookup { .. }
            | Self::Limit { .. }
            | Self::NestedLoopJoin { predicate: None, .. }
            | Self::Nothing { .. }
            | Self::Offset { .. }
            | Self::Remap { .. }
            | Self::Scan { filter: None, .. } => self,
        })
    }
}

/// An aggregate function. Each variant wraps the expression that the
/// function is applied to.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Aggregate {
    /// Displayed as `avg(expr)` in plans.
    Average(Expression),
    /// Displayed as `count(expr)` in plans.
    Count(Expression),
    /// Displayed as `max(expr)` in plans.
    Max(Expression),
    /// Displayed as `min(expr)` in plans.
    Min(Expression),
    /// Displayed as `sum(expr)` in plans.
    Sum(Expression),
}

impl Aggregate {
    /// Formats the aggregate as a function call, e.g. `sum(expr)`. The node
    /// is passed on to the inner expression's display.
    fn format(&self, node: &Node) -> String {
        let function = match self {
            Self::Average(_) => "avg",
            Self::Count(_) => "count",
            Self::Max(_) => "max",
            Self::Min(_) => "min",
            Self::Sum(_) => "sum",
        };
        format!("{function}({})", self.expr().display(node))
    }

    /// Returns the expression that the aggregate function is applied to.
    pub fn expr(&self) -> &Expression {
        // Every variant wraps exactly one expression, so this always matches.
        let (Self::Average(expr)
        | Self::Count(expr)
        | Self::Max(expr)
        | Self::Min(expr)
        | Self::Sum(expr)) = self;
        expr
    }
}

/// A sort order direction.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub enum Direction {
    /// Ascending order, displayed as `asc`.
    Ascending,
    /// Descending order, displayed as `desc`.
    Descending,
}

/// Formats the direction as the lowercase keyword `asc` or `desc`.
impl Display for Direction {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let keyword = match self {
            Self::Ascending => "asc",
            Self::Descending => "desc",
        };
        f.write_str(keyword)
    }
}

impl From<ast::Direction> for Direction {
    fn from(dir: ast::Direction) -> Self {
        match dir {
            ast::Direction::Ascending => Self::Ascending,
            ast::Direction::Descending => Self::Descending,
        }
    }
}

/// Formats the plan as an EXPLAIN tree. Plans with a source node print a
/// header line for the statement followed by the source tree as a child,
/// while a SELECT prints its node tree directly as the root.
impl Display for Plan {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Write the header line, and pick out the source node (if any) that
        // should be rendered below it.
        let source = match self {
            Self::CreateTable { schema } => return write!(f, "CreateTable: {}", schema.name),
            Self::DropTable { name, .. } => return write!(f, "DropTable: {name}"),
            Self::Select(root) => return root.format(f, "", true, true),
            Self::Delete { table, source, .. } => {
                write!(f, "Delete: {table}")?;
                source
            }
            Self::Insert { table, source, .. } => {
                write!(f, "Insert: {}", table.name)?;
                source
            }
            Self::Update { table, source, expressions, .. } => {
                let assignments = expressions
                    .iter()
                    .map(|(index, expr)| {
                        let column = &table.columns[*index].name;
                        format!("{column}={}", expr.display(source))
                    })
                    .join(", ");
                write!(f, "Update: {} ({assignments})", table.name)?;
                source
            }
        };
        source.format(f, "", false, true)
    }
}

impl Display for Node {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.format(f, "", true, true)
    }
}

impl Node {
    /// Recursively formats the node and its children as lines of a plan tree.
    ///
    /// * `prefix`: the tree branch lines inherited from the ancestors, written
    ///   before this node's own branch marker.
    /// * `root`: true if this is the root (first) node. Non-root nodes emit a
    ///   newline before themselves, so the output never ends with a newline.
    /// * `last_child`: true if this is the last child node of the parent. A
    ///   last child uses the `└─` marker (or no marker at all for the root),
    ///   other children use `├─`.
    pub fn format(
        &self,
        f: &mut std::fmt::Formatter<'_>,
        prefix: &str,
        root: bool,
        last_child: bool,
    ) -> std::fmt::Result {
        // If this is not the root node, emit a newline after the previous node.
        // This avoids a spurious newline at the end of the plan.
        if !root {
            writeln!(f)?;
        }

        // Prefix the node with a tree branch line. Modify the prefix for any
        // child nodes we'll recurse into: below a non-last child the vertical
        // line continues, below a last child it is replaced by blank space.
        let prefix = if !last_child {
            write!(f, "{prefix}├─ ")?;
            format!("{prefix}│  ")
        } else if !root {
            write!(f, "{prefix}└─ ")?;
            format!("{prefix}   ")
        } else {
            write!(f, "{prefix}")?;
            prefix.to_string()
        };

        // Format the node itself, then recurse into its children (if any).
        match self {
            Self::Aggregate { source, aggregates, group_by } => {
                // GROUP BY expressions are listed first, then the aggregates.
                let aggregates = group_by
                    .iter()
                    .map(|group_by| group_by.display(source).to_string())
                    .chain(aggregates.iter().map(|agg| agg.format(source)))
                    .join(", ");
                write!(f, "Aggregate: {aggregates}")?;
                source.format(f, &prefix, false, true)?;
            }

            Self::Filter { source, predicate } => {
                write!(f, "Filter: {}", predicate.display(source))?;
                source.format(f, &prefix, false, true)?;
            }

            Self::HashJoin { left, left_column, right, right_column, outer } => {
                let kind = if *outer { "outer" } else { "inner" };
                // Fall back to the column index if the column has no label.
                let left_column = match left.column_label(*left_column) {
                    Label::None => format!("left #{left_column}"),
                    label => format!("{label}"),
                };
                let right_column = match right.column_label(*right_column) {
                    Label::None => format!("right #{right_column}"),
                    label => format!("{label}"),
                };
                write!(f, "HashJoin: {kind} on {left_column} = {right_column}")?;
                left.format(f, &prefix, false, false)?;
                right.format(f, &prefix, false, true)?;
            }

            Self::IndexLookup { table, column, alias, values } => {
                let column = &table.columns[*column].name;
                write!(f, "IndexLookup: {}.{column}", table.name)?;
                if let Some(alias) = alias {
                    write!(f, " as {alias}.{column}")?;
                }
                // List the values if there are just a few, otherwise only
                // show how many there are.
                if !values.is_empty() && values.len() < 10 {
                    write!(f, " ({})", values.iter().join(", "))?;
                } else {
                    write!(f, " ({} values)", values.len())?;
                }
            }

            Self::KeyLookup { table, alias, keys } => {
                write!(f, "KeyLookup: {}", table.name)?;
                if let Some(alias) = alias {
                    write!(f, " as {alias}")?;
                }
                // List the keys if there are just a few, otherwise only show
                // how many there are.
                if !keys.is_empty() && keys.len() < 10 {
                    write!(f, " ({})", keys.iter().join(", "))?;
                } else {
                    write!(f, " ({} keys)", keys.len())?;
                }
            }

            Self::Limit { source, limit } => {
                write!(f, "Limit: {limit}")?;
                source.format(f, &prefix, false, true)?;
            }

            Self::NestedLoopJoin { left, right, predicate, outer, .. } => {
                let kind = if *outer { "outer" } else { "inner" };
                write!(f, "NestedLoopJoin: {kind}")?;
                // The predicate is displayed against the join node itself
                // rather than one of its children.
                if let Some(predicate) = predicate {
                    write!(f, " on {}", predicate.display(self))?;
                }
                left.format(f, &prefix, false, false)?;
                right.format(f, &prefix, false, true)?;
            }

            Self::Nothing { .. } => write!(f, "Nothing")?,

            Self::Offset { source, offset } => {
                write!(f, "Offset: {offset}")?;
                source.format(f, &prefix, false, true)?;
            }

            Self::Order { source, key: orders } => {
                let orders = orders
                    .iter()
                    .map(|(expr, dir)| format!("{} {dir}", expr.display(source)))
                    .join(", ");
                write!(f, "Order: {orders}")?;
                source.format(f, &prefix, false, true)?;
            }

            Self::Projection { source, expressions, aliases } => {
                let expressions = expressions
                    .iter()
                    .enumerate()
                    .map(|(i, expr)| match aliases.get(i) {
                        Some(Label::None) | None => expr.display(source).to_string(),
                        Some(alias) => format!("{} as {alias}", expr.display(source)),
                    })
                    .join(", ");
                write!(f, "Projection: {expressions}")?;
                source.format(f, &prefix, false, true)?;
            }

            Self::Remap { source, targets } => {
                // Show the source column that ends up in each target column,
                // or Null for target columns without a source.
                let remap = invert_remap(targets)
                    .into_iter()
                    .map(|from| match from {
                        Some(from) => match source.column_label(from) {
                            Label::None => format!("#{from}"),
                            label => label.to_string(),
                        },
                        None => "Null".to_string(),
                    })
                    .join(", ");
                write!(f, "Remap: {remap}")?;
                // Also show the source columns that were dropped. Labels are
                // only looked up for the dropped columns.
                let dropped = targets
                    .iter()
                    .enumerate()
                    .filter(|(_, target)| target.is_none())
                    .map(|(i, _)| match source.column_label(i) {
                        Label::None => format!("#{i}"),
                        label => format!("{label}"),
                    })
                    .join(", ");
                if !dropped.is_empty() {
                    write!(f, " (dropped: {dropped})")?;
                }
                source.format(f, &prefix, false, true)?;
            }

            Self::Scan { table, alias, filter } => {
                write!(f, "Scan: {}", table.name)?;
                if let Some(alias) = alias {
                    write!(f, " as {alias}")?;
                }
                if let Some(filter) = filter {
                    write!(f, " ({})", filter.display(self))?;
                }
            }

            Self::Values { rows, .. } => {
                write!(f, "Values: ")?;
                // A single row is shown inline, otherwise only the row count.
                match rows.len() {
                    1 if rows[0].is_empty() => write!(f, "blank row")?,
                    1 => write!(f, "{}", rows[0].iter().map(|e| e.display(self)).join(", "))?,
                    n => write!(f, "{n} rows")?,
                }
            }
        };
        Ok(())
    }
}

/// Inverts a Remap targets vector (source index → target index) into a vector
/// of source indexes by target index. Target columns that no source maps to
/// are None. The result is just long enough to hold the highest target index.
/// If several sources map to the same target, the last one wins.
pub fn invert_remap(targets: &[Option<usize>]) -> Vec<Option<usize>> {
    // The result must be one longer than the highest targeted index.
    let width = targets.iter().flatten().fold(0usize, |width, &to| width.max(to + 1));
    let mut sources = vec![None; width];
    for (from, to) in targets.iter().enumerate() {
        if let Some(to) = *to {
            sources[to] = Some(from);
        }
    }
    sources
}


================================================
FILE: src/sql/planner/planner.rs
================================================
use std::collections::{BTreeMap, HashMap, HashSet};

use itertools::{Either, Itertools as _};

use super::plan::{Aggregate, Node, Plan, invert_remap};
use crate::errinput;
use crate::error::Result;
use crate::sql::engine::Catalog;
use crate::sql::parser::ast;
use crate::sql::types::{Column, Expression, Label, Table, Value};

/// The planner builds an execution plan from a parsed Abstract Syntax Tree,
/// using the catalog for schema information.
///
/// To build the plan, it recursively traverses the AST and transforms AST nodes
/// into plan nodes. The planner also resolves column names to column indexes,
/// using a Scope to track currently visible columns and tables at each node.
pub struct Planner<'a, C: Catalog> {
    /// The catalog used to look up table schemas while planning.
    catalog: &'a C,
}

impl<'a, C: Catalog> Planner<'a, C> {
    /// Creates a new planner.
    pub fn new(catalog: &'a C) -> Self {
        Self { catalog }
    }

    /// Builds a plan for an AST statement.
    pub fn build(&mut self, statement: ast::Statement) -> Result<Plan> {
        use ast::Statement::*;
        match statement {
            CreateTable { name, columns } => self.build_create_table(name, columns),
            DropTable { name, if_exists } => self.build_drop_table(name, if_exists),

            Delete { table, r#where } => self.build_delete(table, r#where),
            Insert { table, columns, values } => self.build_insert(table, columns, values),
            Update { table, set, r#where } => self.build_update(table, set, r#where),
            Select { select, from, r#where, group_by, having, order_by, offset, limit } => {
                self.build_select(select, from, r#where, group_by, having, order_by, offset, limit)
            }

            // Transaction and explain statements are handled by Session.
            Begin { .. } | Commit | Rollback | Explain(_) => {
                panic!("unexpected statement {statement:?}")
            }
        }
    }

    /// Builds a CREATE TABLE plan from the table name and column definitions.
    ///
    /// Errors if the table does not have exactly one primary key column, or if
    /// a column default can't be built as a constant value.
    fn build_create_table(&self, name: String, columns: Vec<ast::Column>) -> Result<Plan> {
        // Most schema validation happens during execution via Table.validate(),
        // but the primary key index is needed to construct the schema at all.
        let mut primary_keys =
            columns.iter().enumerate().filter(|(_, c)| c.primary_key).map(|(index, _)| index);
        let primary_key = match (primary_keys.next(), primary_keys.next()) {
            (Some(index), None) => index,
            (None, _) => return errinput!("no primary key for table {name}"),
            (Some(_), Some(_)) => return errinput!("multiple primary keys for table {name}"),
        };

        let columns = columns
            .into_iter()
            .map(|column| {
                // Columns are nullable unless specified otherwise, except for
                // the primary key.
                let nullable = column.nullable.unwrap_or(!column.primary_key);
                // Nullable columns without an explicit default use NULL.
                let default = match column.default {
                    Some(expr) => Some(Self::build_constant_value(expr)?),
                    None => nullable.then_some(Value::Null),
                };
                // Primary keys are always unique. Unique columns and foreign
                // keys get an index, but the primary key never does.
                let unique = column.unique || column.primary_key;
                let index = !column.primary_key
                    && (column.index || column.unique || column.references.is_some());
                Ok(Column {
                    name: column.name,
                    datatype: column.datatype,
                    nullable,
                    default,
                    unique,
                    index,
                    references: column.references,
                })
            })
            .collect::<Result<_>>()?;

        let schema = Table { name, primary_key, columns };
        Ok(Plan::CreateTable { schema })
    }

    /// Builds a DROP TABLE plan. The table is not looked up here; the name and
    /// the IF EXISTS flag are simply passed along in the plan.
    fn build_drop_table(&self, name: String, if_exists: bool) -> Result<Plan> {
        let plan = Plan::DropTable { name, if_exists };
        Ok(plan)
    }

    /// Builds a DELETE plan, which deletes the rows emitted by a table scan
    /// with the WHERE clause (if any) as its filter.
    ///
    /// Errors if the table doesn't exist or the WHERE expression is invalid.
    fn build_delete(&self, table: String, r#where: Option<ast::Expression>) -> Result<Plan> {
        let schema = self.catalog.must_get_table(&table)?;
        let scope = Scope::from_table(&schema)?;
        let filter = match r#where {
            Some(expr) => Some(Self::build_expression(expr, &scope)?),
            None => None,
        };
        // Grab the table details before the schema is moved into the scan.
        let (name, primary_key) = (schema.name.clone(), schema.primary_key);
        let source = Node::Scan { table: schema, alias: None, filter };
        Ok(Plan::Delete { table: name, primary_key, source })
    }

    /// Builds an INSERT plan, with the rows to insert given as a Values node.
    ///
    /// If a column list is given, the plan's column map maps each listed table
    /// column index to the index of its value in the rows. Errors if the table
    /// doesn't exist, or if a listed column is unknown or given more than once.
    fn build_insert(
        &self,
        table: String,
        columns: Option<Vec<String>>,
        values: Vec<Vec<ast::Expression>>,
    ) -> Result<Plan> {
        let table = self.catalog.must_get_table(&table)?;

        // Map table column indexes to value indexes, if columns were given.
        let column_map = match columns {
            None => None,
            Some(names) => {
                let mut map = HashMap::new();
                for (value_index, name) in names.into_iter().enumerate() {
                    let column_index = match table.columns.iter().position(|c| c.name == name) {
                        Some(column_index) => column_index,
                        None => {
                            return errinput!("unknown column {name} in table {}", table.name);
                        }
                    };
                    if map.insert(column_index, value_index).is_some() {
                        return errinput!("column {name} given multiple times");
                    }
                }
                Some(map)
            }
        };

        // The value expressions are built in an empty scope.
        let scope = Scope::new();
        let mut rows = Vec::with_capacity(values.len());
        for exprs in values {
            let row = exprs
                .into_iter()
                .map(|expr| Self::build_expression(expr, &scope))
                .collect::<Result<Vec<_>>>()?;
            rows.push(row);
        }

        Ok(Plan::Insert { table, column_map, source: Node::Values { rows } })
    }

    /// Builds an UPDATE plan, which applies the SET expressions to the rows
    /// emitted by a table scan with the WHERE clause (if any) as its filter.
    ///
    /// A SET entry without an expression uses the column's default value, and
    /// errors if the column doesn't have one. Also errors if the table or a
    /// column doesn't exist, or if an expression is invalid.
    fn build_update(
        &self,
        table: String,
        set: BTreeMap<String, Option<ast::Expression>>,
        r#where: Option<ast::Expression>,
    ) -> Result<Plan> {
        let schema = self.catalog.must_get_table(&table)?;
        let scope = Scope::from_table(&schema)?;
        let filter = match r#where {
            Some(expr) => Some(Self::build_expression(expr, &scope)?),
            None => None,
        };

        // Resolve each SET entry to a column index and an expression.
        let mut assignments = Vec::with_capacity(set.len());
        for (column, value) in set {
            let index = scope.lookup_column(None, &column)?;
            let expr = if let Some(value) = value {
                Self::build_expression(value, &scope)?
            } else if let Some(default) = &schema.columns[index].default {
                Expression::Constant(default.clone())
            } else {
                return errinput!("column {column} has no default value");
            };
            assignments.push((index, expr));
        }

        // The plan keeps its own copy of the schema, since the scan takes
        // ownership of the original.
        let primary_key = schema.primary_key;
        Ok(Plan::Update {
            table: schema.clone(),
            primary_key,
            source: Node::Scan { table: schema, alias: None, filter },
            expressions: assignments,
        })
    }

    /// Builds a SELECT plan.
    #[allow(clippy::too_many_arguments)]
    fn build_select(
        &self,
        mut select: Vec<(ast::Expression, Option<String>)>,
        from: Vec<ast::From>,
        r#where: Option<ast::Expression>,
        group_by: Vec<ast::Expression>,
        having: Option<ast::Expression>,
        order_by: Vec<(ast::Expression, ast::Direction)>,
        offset: Option<ast::Expression>,
        limit: Option<ast::Expression>,
    ) -> Result<Plan> {
        let mut scope = Scope::new();

        // Build FROM clause.
        let mut node = if !from.is_empty() {
            self.build_from_clause(from, &mut scope)?
        } else {
            // For a constant SELECT, emit a single empty row to project with.
            // This allows using aggregate functions and WHERE as normal.
            Node::Values { rows: vec![vec![]] }
        };

        // Expand out SELECT * to all FROM columns if there are multiple SELECT
        // expressions or a GROUP BY clause (to ensure all columns are in GROUP
        // BY). For simplicity, expressions only supports scalar values, so we
        // special-case the * tuple here.
        if select.contains(&(ast::Expression::All, None)) {
            if node.columns() == 0 {
                return errinput!("SELECT * requires a FROM clause");
            }
            if select.len() > 1 || !group_by.is_empty() {
                select = select
                    .into_iter()
                    .flat_map(|(expr, alias)| match expr {
                        ast::Expression::All => Either::Left(
                            (0..node.columns()).map(|i| (node.column_label(i).into(), None)),
                        ),
                        expr => Either::Right(std::iter::once((expr, alias))),
                    })
                    .collect();
            }
        }

        // Build WHERE clause.
        if let Some(r#where) = r#where {
            let predicate = Self::build_expression(r#where, &scope)?;
            node = Node::Filter { source: Box::new(node), predicate };
        }

        // Build aggregate functions and GROUP BY clause.
        let aggregates = Self::collect_aggregates(&select, &having, &order_by);
        if !group_by.is_empty() || !aggregates.is_empty() {
            node = self.build_aggregate(node, group_by, aggregates, &mut scope)?;
        }

        // Build SELECT clause. We can omit this for a trivial SELECT *.
        if select.as_slice() != [(ast::Expression::All, None)] {
            // Prepare the post-projection scope.
            let mut child_scope = scope.project(&select);

            // Build the SELECT column expressions and aliases.
            let mut expressions = Vec::with_capacity(select.len());
            let mut aliases = Vec::with_capacity(select.len());
            for (expr, alias) in select {
                expressions.push(Self::build_expression(expr, &scope)?);
                aliases.push(Label::from(alias));
            }

            // Add hidden columns for HAVING and ORDER BY columns not in SELECT.
            let hidden = self.build_select_hidden(&having, &order_by, &scope, &mut child_scope);
            aliases.extend(std::iter::repeat_n(Label::None, hidden.len()));
            expressions.extend(hidden);

            scope = child_scope;
            node = Node::Projection { source: Box::new(node), expressions, aliases };
        }

        // Build HAVING clause.
        if let Some(having) = having {
            if scope.aggregates.is_empty() {
                return errinput!("HAVING requires GROUP BY or aggregate function");
            }
            let predicate = Self::build_expression(having, &scope)?;
            node = Node::Filter { source: Box::new(node), predicate };
        }

        // Build ORDER BY clause.
        if !order_by.is_empty() {
            let key = order_by
                .into_iter()
                .map(|(expr, dir)| Ok((Self::build_expression(expr, &scope)?, dir.into())))
                .collect::<Result<_>>()?;
            node = Node::Order { source: Box::new(node), key };
        }

        // Build OFFSET clause.
        if let Some(offset) = offset {
            let offset = match Self::build_constant_value(offset)? {
                Value::Integer(offset) if offset >= 0 => offset as usize,
                offset => return errinput!("invalid offset {offset}"),
            };
            node = Node::Offset { source: Box::new(node), offset }
        }

        // Build LIMIT clause.
        if let Some(limit) = limit {
            let limit = match Self::build_constant_value(limit)? {
                Value::Integer(limit) if limit >= 0 => limit as usize,
                limit => return errinput!("invalid limit {limit}"),
            };
            node = Node::Limit { source: Box::new(node), limit }
        }

        // Remove any hidden columns before emitting the result.
        if let Some(targets) = scope.remap_hidden() {
            node = Node::Remap { source: Box::new(node), targets }
        }

        Ok(Plan::Select(node))
    }

    /// Builds a FROM clause made up of one or more items, where each item is
    /// either a table or a join. The items are implicitly joined without a
    /// predicate, e.g. "SELECT * FROM a, b" joins every row in a with every
    /// row in b. Errors if no items are given.
    fn build_from_clause(&self, from: Vec<ast::From>, scope: &mut Scope) -> Result<Node> {
        // A FROM clause must have at least one item, which is built first.
        let mut items = from.into_iter();
        let Some(first) = items.next() else {
            return errinput!("no from items given");
        };
        let first = self.build_from(first, scope)?;

        // Fold any remaining items into a left-deep chain of inner joins
        // without predicates.
        items.try_fold(first, |left, item| -> Result<Node> {
            let right = self.build_from(item, scope)?;
            Ok(Node::NestedLoopJoin {
                left: Box::new(left),
                right: Box::new(right),
                predicate: None,
                outer: false,
            })
        })
    }

    /// Builds a single FROM item, which is either a table or a (possibly
    /// chained) join of tables, e.g. "SELECT * FROM a LEFT JOIN b ON b.a_id =
    /// a.id". The item's tables and columns are merged into the parent scope.
    fn build_from(&self, from: ast::From, parent_scope: &mut Scope) -> Result<Node> {
        // The item gets a scope of its own, such that a join predicate only
        // sees the columns of the join's children. It's merged into the parent
        // scope at the end.
        let mut scope = Scope::new();

        let node = match from {
            // A table is read with a full table scan.
            ast::From::Table { name, alias } => {
                let table = self.catalog.must_get_table(&name)?;
                scope.add_table(&table, alias.as_deref())?;
                Node::Scan { table, alias, filter: None }
            }

            // A two-way join, where either side may itself be a join.
            ast::From::Join { left, right, r#type, predicate } => {
                // A right join is built as a left join with the sides swapped,
                // followed by a remap that restores the column order.
                let is_right_join = r#type == ast::JoinType::Right;
                let (left, right) = if is_right_join { (right, left) } else { (left, right) };

                // Build both sides, adding their columns to the join's scope.
                let left = Box::new(self.build_from(*left, &mut scope)?);
                let right = Box::new(self.build_from(*right, &mut scope)?);
                let (left_size, right_size) = (left.columns(), right.columns());

                // Build the join node itself.
                let predicate = match predicate {
                    Some(expr) => Some(Self::build_expression(expr, &scope)?),
                    None => None,
                };
                let outer = r#type.is_outer();
                let join = Node::NestedLoopJoin { left, right, predicate, outer };

                if is_right_join {
                    // Rotate the columns by the size of the right side, which
                    // swaps the columns of the two sides.
                    let size = left_size + right_size;
                    let targets: Vec<Option<usize>> =
                        (0..size).map(|i| Some((i + right_size) % size)).collect();
                    scope = scope.remap(&targets);
                    Node::Remap { source: Box::new(join), targets }
                } else {
                    join
                }
            }
        };

        parent_scope.merge(scope)?;
        Ok(node)
    }

    /// Builds an Aggregate node, which computes aggregate functions over a
    /// set of GROUP BY buckets. The aggregate functions passed in have been
    /// collected from the SELECT, HAVING, and ORDER BY clauses.
    ///
    /// The ast::Expression of every aggregate function and GROUP BY
    /// expression is registered in the scope and mapped to its output column
    /// index. Nodes built afterwards (i.e. SELECT, HAVING, and ORDER BY) can
    /// then look up these expressions while building their own expressions.
    /// Consider e.g.:
    ///
    /// SELECT SUM(a) / COUNT(*) FROM t GROUP BY b % 10 HAVING b % 10 >= 5 ORDER BY MAX(c)
    ///
    /// This builds an Aggregate node for SUM(a), COUNT(*), MAX(c) bucketed by
    /// b % 10. The SELECT can look up SUM(a) and COUNT(*) to compute the
    /// division, and HAVING can look up b % 10 to compute the predicate.
    ///
    /// On success, `scope` is replaced by the scope of the node's output.
    fn build_aggregate(
        &self,
        source: Node,
        mut group_by: Vec<ast::Expression>,
        mut aggregates: Vec<ast::Expression>,
        scope: &mut Scope,
    ) -> Result<Node> {
        // Register the AST expressions in the output scope, GROUP BY
        // expressions first. An expression that was already registered is a
        // duplicate and is dropped from its list.
        let mut output_scope = scope.spawn();
        for exprs in [&mut group_by, &mut aggregates] {
            exprs.retain(|expr| output_scope.add_aggregate(expr, scope).is_some());
        }

        // Build the surviving expressions against the input scope.
        let group_exprs = group_by
            .into_iter()
            .map(|expr| Self::build_expression(expr, scope))
            .collect::<Result<_>>()?;
        let aggregate_fns = aggregates
            .into_iter()
            .map(|expr| Self::build_aggregate_function(expr, scope))
            .collect::<Result<_>>()?;

        *scope = output_scope;
        Ok(Node::Aggregate {
            source: Box::new(source),
            group_by: group_exprs,
            aggregates: aggregate_fns,
        })
    }

    /// Builds an aggregate function from an AST expression.
    fn build_aggregate_function(expr: ast::Expression, scope: &Scope) -> Result<Aggregate> {
        let ast::Expression::Function(name, mut args) = expr else {
            panic!("aggregate expression must be function");
        };
        if args.len() != 1 {
            return errinput!("{name} takes 1 argument");
        }
        if args[0].contains(&|expr| Self::is_aggregate_function(expr)) {
            return errinput!("aggregate functions can't be nested");
        }
        // Special-case COUNT(*) since expressions don't support tuples.
        let expr = match (name.as_str(), args.remove(0)) {
            ("count", ast::Expression::All) => Expression::Constant(Value::Boolean(true)),
            (_, arg) => Self::build_expression(arg, scope)?,
        };
        Ok(match name.as_str() {
            "avg" => Aggregate::Average(expr),
            "count" => Aggregate::Count(expr),
            "min" => Aggregate::Min(expr),
            "max" => Aggregate::Max(expr),
            "sum" => Aggregate::Sum(expr),
            name => return errinput!("unknown aggregate function {name}"),
        })
    }

    /// Returns true if the AST expression is a call to one of the aggregate
    /// functions avg, count, max, min, or sum. Only the expression itself is
    /// inspected, not its arguments.
    fn is_aggregate_function(expr: &ast::Expression) -> bool {
        matches!(
            expr,
            ast::Expression::Function(name, _)
                if matches!(name.as_str(), "avg" | "count" | "max" | "min" | "sum")
        )
    }

    /// Gathers the aggregate function calls found in the SELECT, HAVING, and
    /// ORDER BY clauses, visiting the clauses in that order.
    fn collect_aggregates(
        select: &[(ast::Expression, Option<String>)],
        having: &Option<ast::Expression>,
        order_by: &[(ast::Expression, ast::Direction)],
    ) -> Vec<ast::Expression> {
        let candidates = select
            .iter()
            .map(|(expr, _)| expr)
            .chain(having.iter())
            .chain(order_by.iter().map(|(expr, _)| expr));

        let mut found = Vec::new();
        for candidate in candidates {
            candidate.collect(&|expr| Self::is_aggregate_function(expr), &mut found);
        }
        found
    }

    /// Builds hidden projection columns that pass through values needed by
    /// nodes downstream of the projection. Consider e.g.:
    ///
    /// SELECT id FROM table ORDER BY value
    ///
    /// The ORDER BY node is evaluated after the SELECT projection (it may
    /// need to order on projected columns), but "value" isn't projected and
    /// so the ORDER BY node can't see it. A hidden "value" column is added to
    /// the projection to satisfy the ORDER BY.
    ///
    /// `scope` is the scope before the projection and `child_scope` the scope
    /// after it. Each hidden column is registered in `child_scope` and
    /// returned as a reference to the corresponding `scope` column. Hidden
    /// columns are stripped again before the result is returned to the client.
    fn build_select_hidden(
        &self,
        having: &Option<ast::Expression>,
        order_by: &[(ast::Expression, ast::Direction)],
        scope: &Scope,
        child_scope: &mut Scope,
    ) -> Vec<Expression> {
        let mut hidden = Vec::new();
        let ordering = order_by.iter().map(|(expr, _)| expr);
        for root in having.iter().chain(ordering) {
            root.walk(&mut |expr| {
                // An aggregate or GROUP BY expression that exists before the
                // projection, but isn't available after it, must be hidden.
                let aggregate = scope
                    .lookup_aggregate(expr)
                    .filter(|_| child_scope.lookup_aggregate(expr).is_none());

                let parent_index = match (aggregate, expr) {
                    (Some(index), _) => Some(index),
                    // Otherwise, a column reference that doesn't resolve after
                    // the projection but does resolve before it must be
                    // hidden. If it resolves in neither (i.e. an unknown
                    // column), the error is ignored here; it will be surfaced
                    // when the expression is built.
                    (None, ast::Expression::Column(table, column)) => {
                        if child_scope.lookup_column(table.as_deref(), column).is_ok() {
                            None
                        } else {
                            scope.lookup_column(table.as_deref(), column).ok()
                        }
                    }
                    (None, _) => None,
                };

                if let Some(index) = parent_index {
                    child_scope.add_passthrough(scope, index, true);
                    hidden.push(Expression::Column(index));
                }
                // Always keep walking the expression tree.
                true
            });
        }
        hidden
    }

    /// Converts an AST expression into a plan expression, resolving column
    /// references and aggregate expressions to column indexes via the scope.
    ///
    /// Errors on a bare `*`, on column references that can't be resolved in
    /// the scope, and on unknown functions. Panics on an IS operand other
    /// than NULL or NaN, which the parser is expected to prevent.
    pub fn build_expression(expr: ast::Expression, scope: &Scope) -> Result<Expression> {
        use Expression::*;

        // Aggregate functions and GROUP BY expressions were registered in the
        // scope when the Aggregate node (if any) was built. Any expression
        // registered there is simply a reference to its output column.
        if let Some(index) = scope.lookup_aggregate(&expr) {
            return Ok(Column(index));
        }

        // Builds a boxed child expression in the same scope.
        let build = |child: Box<ast::Expression>| -> Result<Box<Expression>> {
            Self::build_expression(*child, scope).map(Box::new)
        };

        let built = match expr {
            // Expression evaluation only handles scalar values, not compound
            // types like tuples, so * is special-cased in SELECT and COUNT(*)
            // and rejected everywhere else.
            ast::Expression::All => return errinput!("unsupported use of *"),

            ast::Expression::Literal(literal) => {
                let value = match literal {
                    ast::Literal::Null => Value::Null,
                    ast::Literal::Boolean(b) => Value::Boolean(b),
                    ast::Literal::Integer(i) => Value::Integer(i),
                    ast::Literal::Float(f) => Value::Float(f),
                    ast::Literal::String(s) => Value::String(s),
                };
                Constant(value)
            }

            ast::Expression::Column(table, name) => {
                let index = scope.lookup_column(table.as_deref(), &name)?;
                Column(index)
            }

            // NB: aggregate functions were handled by the lookup above.
            ast::Expression::Function(name, mut args) => match (name.as_str(), args.len()) {
                ("sqrt", 1) => {
                    SquareRoot(Box::new(Self::build_expression(args.remove(0), scope)?))
                }
                (name, n) => return errinput!("unknown function {name} with {n} arguments"),
            },

            ast::Expression::Operator(operator) => match operator {
                // Logical operators.
                ast::Operator::And(lhs, rhs) => And(build(lhs)?, build(rhs)?),
                ast::Operator::Or(lhs, rhs) => Or(build(lhs)?, build(rhs)?),
                ast::Operator::Not(operand) => Not(build(operand)?),

                // Comparison operators. >=, <= and != have no plan expression
                // of their own and are composed from >, <, = and NOT.
                ast::Operator::Equal(lhs, rhs) => Equal(build(lhs)?, build(rhs)?),
                ast::Operator::NotEqual(lhs, rhs) => {
                    Not(Box::new(Equal(build(lhs)?, build(rhs)?)))
                }
                ast::Operator::GreaterThan(lhs, rhs) => GreaterThan(build(lhs)?, build(rhs)?),
                ast::Operator::GreaterThanOrEqual(lhs, rhs) => {
                    let greater = GreaterThan(build(lhs.clone())?, build(rhs.clone())?);
                    let equal = Equal(build(lhs)?, build(rhs)?);
                    Or(Box::new(greater), Box::new(equal))
                }
                ast::Operator::LessThan(lhs, rhs) => LessThan(build(lhs)?, build(rhs)?),
                ast::Operator::LessThanOrEqual(lhs, rhs) => {
                    let less = LessThan(build(lhs.clone())?, build(rhs.clone())?);
                    let equal = Equal(build(lhs)?, build(rhs)?);
                    Or(Box::new(less), Box::new(equal))
                }
                ast::Operator::Is(operand, literal) => {
                    let operand = build(operand)?;
                    let value = match literal {
                        ast::Literal::Null => Value::Null,
                        ast::Literal::Float(f) if f.is_nan() => Value::Float(f),
                        value => panic!("invalid IS value {value:?}"), // enforced by parser
                    };
                    Is(operand, value)
                }
                ast::Operator::Like(lhs, rhs) => Like(build(lhs)?, build(rhs)?),

                // Arithmetic operators.
                ast::Operator::Add(lhs, rhs) => Add(build(lhs)?, build(rhs)?),
                ast::Operator::Subtract(lhs, rhs) => Subtract(build(lhs)?, build(rhs)?),
                ast::Operator::Multiply(lhs, rhs) => Multiply(build(lhs)?, build(rhs)?),
                ast::Operator::Divide(lhs, rhs) => Divide(build(lhs)?, build(rhs)?),
                ast::Operator::Remainder(lhs, rhs) => Remainder(build(lhs)?, build(rhs)?),
                ast::Operator::Exponentiate(lhs, rhs) => Exponentiate(build(lhs)?, build(rhs)?),
                ast::Operator::Factorial(operand) => Factorial(build(operand)?),
                ast::Operator::Identity(operand) => Identity(build(operand)?),
                ast::Operator::Negate(operand) => Negate(build(operand)?),
            },
        };
        Ok(built)
    }

    /// Evaluates an AST expression to a constant value. The expression is
    /// built against an empty scope and evaluated without an input row, so it
    /// can't contain column references or aggregate functions.
    fn build_constant_value(expr: ast::Expression) -> Result<Value> {
        let empty = Scope::new();
        let constant = Self::build_expression(expr, &empty)?;
        constant.evaluate(None)
    }
}

/// A scope maps column/table names to input column indexes, for lookups during
/// expression construction. It also tracks aggregate and GROUP BY expressions,
/// as well as hidden columns (e.g. ORDER BY columns that aren't projected in
/// the SELECT clause).
///
/// When building expressions, the scope is used to resolve column names to
/// column indexes, which are placed in the plan and used during execution.
/// Expression evaluation generally happens in the context of an input row. This
/// row may come directly from a single table, or it may be the result of a long
/// chain of joins and projections. The scope keeps track of which columns are
/// currently visible and what names they have.
///
/// All indexes stored in the scope are positions in `columns`.
#[derive(Default)]
pub struct Scope {
    /// The currently visible columns, in order. If empty, only constant
    /// expressions can be used (no column references).
    columns: Vec<Label>,
    /// Names of the currently visible tables, as used in the query (i.e. the
    /// alias if one was given). Used to reject duplicate and unknown tables.
    tables: HashSet<String>,
    /// Index of fully qualified table.column names to column indexes. Qualified
    /// names are always unique within a scope.
    qualified: HashMap<(String, String), usize>,
    /// Index of unqualified column names to column indexes. If a name points
    /// to multiple columns, lookups will fail with an ambiguous name error.
    unqualified: HashMap<String, Vec<usize>>,
    /// Index of aggregate and GROUP BY expressions to column indexes. This is
    /// used to track output columns of Aggregate nodes and look them up from
    /// expressions in downstream SELECT, HAVING, and ORDER BY clauses. If the
    /// node contains an (inner) Aggregate node, this is never empty.
    aggregates: HashMap<ast::Expression, usize>,
    /// Indexes of hidden columns. These are used to pass e.g. ORDER BY and
    /// HAVING expressions through SELECT projection nodes if the expressions
    /// aren't already projected. They should be removed before emitting
    /// results.
    hidden: HashSet<usize>,
}

impl Scope {
    /// Creates a new, empty scope.
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a scope from a table, using the table's original name.
    fn from_table(table: &Table) -> Result<Self> {
        let mut scope = Self::new();
        scope.add_table(table, None)?;
        Ok(scope)
    }

    /// Creates a new child scope that inherits from the parent scope. Only the
    /// table names are carried over; the child starts without any columns,
    /// aggregates, or hidden columns.
    pub fn spawn(&self) -> Self {
        let mut child = Scope::new();
        child.tables = self.tables.clone(); // retain table names
        child
    }

    /// Adds a table to the scope. The label is either the table's original name
    /// or an alias, and must be unique. All table columns are added, in order.
    ///
    /// Errors if a table with the same label already exists in the scope.
    fn add_table(&mut self, table: &Table, alias: Option<&str>) -> Result<()> {
        let name = alias.unwrap_or(&table.name);
        if self.tables.contains(name) {
            return errinput!("duplicate table name {name}");
        }
        for column in &table.columns {
            self.add_column(Label::Qualified(name.to_string(), column.name.clone()));
        }
        self.tables.insert(name.to_string());
        Ok(())
    }

    /// Appends a column with the given label to the scope. Returns the column
    /// index. Qualified labels are indexed by both their qualified and their
    /// unqualified name, unqualified labels only by the latter.
    fn add_column(&mut self, label: Label) -> usize {
        let index = self.columns.len();
        if let Label::Qualified(table, column) = &label {
            self.qualified.insert((table.clone(), column.clone()), index);
        }
        if let Label::Qualified(_, name) | Label::Unqualified(name) = &label {
            self.unqualified.entry(name.clone()).or_default().push(index)
        }
        self.columns.push(label);
        index
    }

    /// Looks up a column index by name, if possible.
    ///
    /// Errors if the scope has no columns (only constant expressions are
    /// allowed), if the table is unknown, if an unqualified name matches more
    /// than one column, or if no column matches.
    fn lookup_column(&self, table: Option<&str>, name: &str) -> Result<usize> {
        // Formats the column name for error messages. Only evaluated on error.
        let fmtname = || match table {
            Some(table) => format!("{table}.{name}"),
            None => name.to_string(),
        };
        if self.columns.is_empty() {
            return errinput!("expression must be constant, found column {}", fmtname());
        }
        if let Some(table) = table {
            if !self.tables.contains(table) {
                return errinput!("unknown table {table}");
            }
            if let Some(index) = self.qualified.get(&(table.to_string(), name.to_string())) {
                return Ok(*index);
            }
        } else if let Some(indexes) = self.unqualified.get(name) {
            if indexes.len() > 1 {
                return errinput!("ambiguous column {name}");
            }
            return Ok(indexes[0]);
        }
        // In an aggregate scope, a column that can't be found is most likely a
        // source column that wasn't aggregated or grouped on.
        if !self.aggregates.is_empty() {
            return errinput!(
                "column {} must be used in an aggregate or GROUP BY expression",
                fmtname()
            );
        }
        errinput!("unknown column {}", fmtname())
    }

    /// Adds an aggregate expression to the scope, returning the new column
    /// index or None if the expression already exists. This is either an
    /// aggregate function or a GROUP BY expression, used to look up the
    /// aggregate output column from e.g. SELECT, HAVING, and ORDER BY.
    fn add_aggregate(&mut self, expr: &ast::Expression, parent: &Scope) -> Option<usize> {
        if self.aggregates.contains_key(expr) {
            return None;
        }
        // If this is a simple column reference (i.e. GROUP BY foo), pass
        // through the column label from the parent scope for lookups.
        let mut label = Label::None;
        if let ast::Expression::Column(table, column) = expr {
            // Ignore errors, they will be emitted when building the expression.
            if let Ok(index) = parent.lookup_column(table.as_deref(), column.as_str()) {
                label = parent.columns[index].clone();
            }
        }
        let index = self.add_column(label);
        self.aggregates.insert(expr.clone(), index);
        Some(index)
    }

    /// Looks up an aggregate column index by aggregate function or GROUP BY
    /// expression.
    fn lookup_aggregate(&self, expr: &ast::Expression) -> Option<usize> {
        self.aggregates.get(expr).copied()
    }

    /// Adds a column that passes through a column from the parent scope,
    /// retaining its properties: its label, any aggregate expressions that
    /// map to it, and its hidden status. If hide is true, the column is hidden
    /// regardless. Returns the new column index.
    fn add_passthrough(&mut self, parent: &Scope, parent_index: usize, hide: bool) -> usize {
        let index = self.add_column(parent.columns[parent_index].clone());
        for (expr, i) in &parent.aggregates {
            if *i == parent_index {
                self.aggregates.entry(expr.clone()).or_insert(index);
            }
        }
        if hide || parent.hidden.contains(&parent_index) {
            self.hidden.insert(index);
        }
        index
    }

    /// Merges two scopes, by appending the given scope to self.
    ///
    /// Errors if the scopes share a table name. In that case self is left
    /// unchanged.
    fn merge(&mut self, scope: Scope) -> Result<()> {
        // Validate before mutating, so that a failed merge doesn't leave some
        // of the given scope's tables behind in self.
        if let Some(table) = scope.tables.iter().find(|table| self.tables.contains(*table)) {
            return errinput!("duplicate table name {table}");
        }
        self.tables.extend(scope.tables);

        // The merged columns are appended after the existing ones, so all of
        // their indexes shift by the current column count.
        let offset = self.columns.len();
        for label in scope.columns {
            self.add_column(label);
        }
        for (expr, index) in scope.aggregates {
            self.aggregates.entry(expr).or_insert(index + offset);
        }
        self.hidden.extend(scope.hidden.into_iter().map(|index| index + offset));
        Ok(())
    }

    /// Projects the scope via the given expressions and aliases, creating a new
    /// child scope with one column per expression. These may be a simple column
    /// reference (e.g. "SELECT a, b FROM table"), which passes through the
    /// corresponding column from the original scope and retains its qualified
    /// and unqualified names. Otherwise, for any other expression, a new
    /// unlabeled column is created. An explicit alias, if given, always takes
    /// precedence and becomes the column's unqualified name.
    fn project(&self, expressions: &[(ast::Expression, Option<String>)]) -> Self {
        let mut child = self.spawn();
        for (expr, alias) in expressions {
            // Use the alias if given, or look up any column references.
            let mut label = Label::None;
            if let Some(alias) = alias {
                label = Label::Unqualified(alias.clone());
            } else if let ast::Expression::Column(table, column) = expr {
                // Ignore errors, they will be surfaced in build_expression().
                if let Ok(index) = self.lookup_column(table.as_deref(), column.as_str()) {
                    label = self.columns[index].clone();
                }
            }
            let index = child.add_column(label);
            // If this is an aggregate query, then all projected expressions
            // must also be aggregates by definition (an aggregate node can only
            // emit aggregate functions or GROUP BY expressions).
            if !self.aggregates.is_empty() {
                child.aggregates.entry(expr.clone()).or_insert(index);
            }
        }
        child
    }

    /// Remaps the scope using the given targets, returning a new scope whose
    /// columns are passed through from this scope in the remapped order.
    fn remap(&self, targets: &[Option<usize>]) -> Self {
        let mut child = self.spawn();
        for index in invert_remap(targets).into_iter().flatten() {
            child.add_passthrough(self, index, false);
        }
        child
    }

    /// Removes hidden columns from the scope, returning their (original)
    /// indexes or None if no columns are hidden. The remaining columns are
    /// compacted, and all name and aggregate indexes are updated to point to
    /// the columns' new positions.
    fn remove_hidden(&mut self) -> Option<HashSet<usize>> {
        if self.hidden.is_empty() {
            return None;
        }
        let hidden = std::mem::take(&mut self.hidden);

        // The new position of a retained column: every hidden column that
        // preceded it is removed, shifting it down by one. This is a noop if
        // all hidden columns trail the visible ones.
        let compact = |index: usize| index - hidden.iter().filter(|&&h| h < index).count();

        let mut index = 0;
        self.columns.retain(|_| {
            let retain = !hidden.contains(&index);
            index += 1;
            retain
        });

        // Drop index entries of hidden columns, then move the remaining
        // entries to the compacted column positions.
        self.qualified.retain(|_, index| !hidden.contains(index));
        self.qualified.values_mut().for_each(|index| *index = compact(*index));

        for indexes in self.unqualified.values_mut() {
            indexes.retain(|index| !hidden.contains(index));
            indexes.iter_mut().for_each(|index| *index = compact(*index));
        }
        self.unqualified.retain(|_, indexes| !indexes.is_empty());

        self.aggregates.retain(|_, index| !hidden.contains(index));
        self.aggregates.values_mut().for_each(|index| *index = compact(*index));

        Some(hidden)
    }

    /// Removes hidden columns from the scope and returns the remaining column
    /// indexes as a Remap targets vector, or None if no columns are hidden. A
    /// Remap targets vector maps parent column indexes to child column indexes,
    /// or None if a column should be dropped.
    fn remap_hidden(&mut self) -> Option<Vec<Option<usize>>> {
        // Capture the size before the hidden columns are removed, since the
        // targets have one entry per original column.
        let size = self.columns.len();
        let hidden = self.remove_hidden()?;
        let mut targets = vec![None; size];
        let mut index = 0;
        for (old_index, target) in targets.iter_mut().enumerate() {
            if !hidden.contains(&old_index) {
                *target = Some(index);
                index += 1;
            }
        }
        Some(targets)
    }
}


================================================
FILE: src/sql/testscripts/expressions/cnf
================================================
# Tests conversion of logical expressions into conjunctive normal form.

# Noop for non-boolean expressions.
[cnf]> 1 + 2
---
3 ← 1 + 2

# Applies De Morgan's laws.
[cnf]> NOT (TRUE AND FALSE)
---
TRUE ← NOT TRUE OR NOT FALSE

[cnf]> NOT (TRUE OR FALSE)
---
FALSE ← NOT TRUE AND NOT FALSE

# NOTs are pushed into the expression.
[cnf]> NOT (TRUE AND TRUE AND TRUE OR TRUE)
---
FALSE ← (NOT TRUE OR NOT TRUE OR NOT TRUE) AND NOT TRUE

# ORs are converted to ANDs by the distributive law.
[cnf]> (TRUE AND FALSE) OR (FALSE AND TRUE)
---
FALSE ← (TRUE OR FALSE) AND (TRUE OR TRUE) AND (FALSE OR FALSE) AND (FALSE OR TRUE)

# This is also true when combined with De Morgan's laws.
[cnf]> NOT ((TRUE OR FALSE) AND (TRUE OR FALSE))
---
FALSE ← (NOT TRUE OR NOT TRUE) AND (NOT TRUE OR NOT FALSE) AND (NOT FALSE OR NOT TRUE) AND (NOT FALSE OR NOT FALSE)


================================================
FILE: src/sql/testscripts/expressions/func
================================================
# Tests function calls.

# Function names are case-insensitive.
> sqrt(1)
> SQRT(1)
---
1.0
1.0

# A space is allowed around the arguments.
> sqrt ( 1 )
---
1.0

# Wrong number of arguments errors.
!> sqrt()
!> sqrt(1, 2)
---
Error: invalid input: unknown function sqrt with 0 arguments
Error: invalid input: unknown function sqrt with 2 arguments

# Unknown functions error.
!> unknown()
!> unknown(1, 2, 3)
---
Error: invalid input: unknown function unknown with 0 arguments
Error: invalid input: unknown function unknown with 3 arguments

# Parse errors.
!> unknown(1, 2, 3
!> unknown(1, 2, 3,)
!> unknown(1, 2 3)
---
Error: invalid input: unexpected end of input
Error: invalid input: expected expression atom, found )
Error: invalid input: expected token ,, found 3


================================================
FILE: src/sql/testscripts/expressions/func_sqrt
================================================
# Tests sqrt().

# Integers work, and return floats.
[expr]> sqrt(2)
[expr]> sqrt(100)
---
1.4142135623730951 ← SquareRoot(Constant(Integer(2)))
10.0 ← SquareRoot(Constant(Integer(100)))

# Negative integers error, but 0 is valid.
!> sqrt(-1)
> sqrt(0)
---
Error: invalid input: can't take square root of -1
0.0

# Floats work.
> sqrt(3.14)
> sqrt(100.0)
---
1.772004514666935
10.0

# Negative floats work, but return NAN.
> sqrt(-1.0)
---
NaN

# Test various special float values.
> sqrt(-0.0)
> sqrt(0.0)
> sqrt(NAN)
> sqrt(INFINITY)
> sqrt(-INFINITY)
---
-0.0
0.0
NaN
inf
NaN

# NULL is passed through.
> sqrt(NULL)
---
NULL

# Strings and booleans error.
!> sqrt(TRUE)
!> sqrt('foo')
---
Error: invalid input: can't take square root of TRUE
Error: invalid input: can't take square root of 'foo'


================================================
FILE: src/sql/testscripts/expressions/literals
================================================
# Tests parsing and evaluation of literals and constants.

# Boolean and float constants.
true
false
null
infinity
nan
---
TRUE
FALSE
NULL
inf
NaN

# Constants are case-insensitive.
NULL
NaN
---
NULL
NaN

# Integers.
3
314
03
---
3
314
3

# Floats with decimal points.
3.72
3.
3.0
---
3.72
3.0
3.0

# Negative or explicit positive numbers are parsed as prefix operators.
[expr]> -3
[expr]> +3
[expr]> -3.14
[expr]> +3.14
---
-3 ← Negate(Constant(Integer(3)))
3 ← Identity(Constant(Integer(3)))
-3.14 ← Negate(Constant(Float(3.14)))
3.14 ← Identity(Constant(Float(3.14)))

# Floats with exponents.
3.14e3
2.718E-2
---
3140.0
0.02718

# Integer overflow/underflow.
>  9223372036854775807
!> 9223372036854775808
>  -9223372036854775807
!> -9223372036854775808
---
9223372036854775807
Error: invalid input: number too large to fit in target type
-9223372036854775807
Error: invalid input: number too large to fit in target type

# Float overflow/underflow.
> 1.23456789012345e308
> 1e309
> -1.23456789012345e308
> -1e309
---
1.23456789012345e308
inf
-1.23456789012345e308
-inf

# Float precision.
> 1.23456789012345e-307
> -1.23456789012345e-307
> 1.23456789012345e-323
> 0.12345678901234567890
> 1e-325
---
1.23456789012345e-307
-1.23456789012345e-307
1e-323
0.12345678901234568
0.0

# Strings, using single quotes. Only '' is supported as an escape sequence.
> 'Hi! 👋'
> 'Has ''single'' and "double" quotes'
> 'Try \n newlines and \t tabs'
---
'Hi! 👋'
'Has \'single\' and \"double\" quotes'
'Try \\n newlines and \\t tabs'

# Double quotes are identifiers, not string literals. This fails to evaluate as
# a constant expression.
!> "Hi!"
---
Error: invalid input: expression must be constant, found column Hi!


================================================
FILE: src/sql/testscripts/expressions/op_compare_equal
================================================
# Tests the = equality operator.

# Booleans.
> TRUE = TRUE
> TRUE = FALSE
> FALSE = TRUE
---
TRUE
FALSE
FALSE

# Integers.
> 1 = 1
> 1 = 2
---
TRUE
FALSE

# Floats.
> 3.14 = 3.14
> 3.14 = 2.718
---
TRUE
FALSE

# Float special values.
> 0.0 = -0.0
> INFINITY = INFINITY
> NAN = NAN
---
TRUE
TRUE
FALSE

# Mixed integers and floats.
> 3.0 = 3
> 3.01 = 3
> 3 = 3.01
> -0.0 = 0
---
TRUE
FALSE
FALSE
TRUE

# Strings.
> 'abc' = 'abc'
> 'abc' = 'ab'
> 'abc' = 'abcd'
> 'abc' = 'ABC'
> '😀' = '😀'
> '😀' = '🙁'
---
TRUE
FALSE
FALSE
FALSE
TRUE
FALSE

# NULLs.
> 1 = NULL
> 3.14 = NULL
> FALSE = NULL
> '' = NULL
> NULL = NULL
> NAN = NULL
> INFINITY = NULL
---
NULL
NULL
NULL
NULL
NULL
NULL
NULL

# Type mismatches.
!> true = 1
!> 'true' = true
---
Error: invalid input: can't compare TRUE and 1
Error: invalid input: can't compare 'true' and TRUE


================================================
FILE: src/sql/testscripts/expressions/op_compare_greater
================================================
# Tests the > greater than operator.

# Booleans.
> TRUE > FALSE
> FALSE > TRUE
> TRUE > TRUE
> FALSE > FALSE
---
TRUE
FALSE
FALSE
FALSE

# Integers.
> 3 > 2
> 3 > 3
> 3 > 4
> -1 > 0
> 0 > -1
---
TRUE
FALSE
FALSE
FALSE
TRUE

# Floats.
> 3.14 > 3.13
> 3.14 > 3.14
> 3.14 > 3.15
> 0.0 > -0.0
---
TRUE
FALSE
FALSE
FALSE

# Float special values.
> INFINITY > 1e300
> INFINITY > INFINITY
> INFINITY > -INFINITY
> NAN > NAN
> NAN > INFINITY
> INFINITY > NAN
> NAN > 0.0
---
TRUE
FALSE
TRUE
FALSE
FALSE
FALSE
FALSE

# Mixed integer/float values.
> 3 > 3.0
> 3 > 2.9
> 3 > 3.1
> 0 > -0.0
---
FALSE
TRUE
FALSE
FALSE

# Strings.
> 'abc' > 'abc'
> 'abc' > 'abb'
> 'abc' > 'ab'
> 'b' > 'abc'
---
FALSE
TRUE
TRUE
TRUE

# Empty strings.
> '' > ''
> 'a' > ''
> '' > 'a'
---
FALSE
TRUE
FALSE

# String case comparisons.
> 'a' > 'B'
> 'z' > 'B'
> 'A' > 'b'
> 'Z' > 'b'
---
TRUE
TRUE
FALSE
FALSE

# Unicode strings.
> '🙁' > '😀'
> '😀' > '😀'
> '😀' > '🙁'
---
TRUE
FALSE
FALSE

# NULLs.
> TRUE > NULL
> NULL > TRUE
> 1 > NULL
> NULL > 1
> 3.14 > NULL
> NULL > 3.14
> '' > NULl
> NULL > ''
> NULL > NULL
> NULL > NAN
---
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL

# Type conflicts.
!> TRUE > 1
!> TRUE > ''
!> '' > 1
---
Error: invalid input: can't compare TRUE and 1
Error: invalid input: can't compare TRUE and ''
Error: invalid input: can't compare '' and 1


================================================
FILE: src/sql/testscripts/expressions/op_compare_greater_equal
================================================
# Tests the >= greater than or equal operator.

# This is implemented as > OR =, just verify this for a few basic cases.

[expr]> 0 >= 1
[expr]> 0 >= 0
[expr]> 0 >= -1
---
FALSE ← Or(GreaterThan(Constant(Integer(0)), Constant(Integer(1))), Equal(Constant(Integer(0)), Constant(Integer(1))))
TRUE ← Or(GreaterThan(Constant(Integer(0)), Constant(Integer(0))), Equal(Constant(Integer(0)), Constant(Integer(0))))
TRUE ← Or(GreaterThan(Constant(Integer(0)), Negate(Constant(Integer(1)))), Equal(Constant(Integer(0)), Negate(Constant(Integer(1)))))

[expr]> -0.0 >= 0.0
[expr]> INFINITY >= INFINITY
[expr]> NAN >= NAN
---
TRUE ← Or(GreaterThan(Negate(Constant(Float(0.0))), Constant(Float(0.0))), Equal(Negate(Constant(Float(0.0))), Constant(Float(0.0))))
TRUE ← Or(GreaterThan(Constant(Float(inf)), Constant(Float(inf))), Equal(Constant(Float(inf)), Constant(Float(inf))))
FALSE ← Or(GreaterThan(Constant(Float(NaN)), Constant(Float(NaN))), Equal(Constant(Float(NaN)), Constant(Float(NaN))))

[expr]> NULL >= 1
[expr]> NULL >= NAN
[expr]> NULL >= NULL
---
NULL ← Or(GreaterThan(Constant(Null), Constant(Integer(1))), Equal(Constant(Null), Constant(Integer(1))))
NULL ← Or(GreaterThan(Constant(Null), Constant(Float(NaN))), Equal(Constant(Null), Constant(Float(NaN))))
NULL ← Or(GreaterThan(Constant(Null), Constant(Null)), Equal(Constant(Null), Constant(Null)))


================================================
FILE: src/sql/testscripts/expressions/op_compare_is_nan
================================================
# Tests the IS NAN equality operator.

> 0.0 IS NAN
> NAN IS NAN
> NULL IS NAN
---
FALSE
TRUE
NULL

!> FALSE IS NAN
!> 0 IS NAN
!> '' IS NAN
!> 'nan' IS NAN
---
Error: invalid input: IS NAN can't be used with BOOLEAN
Error: invalid input: IS NAN can't be used with INTEGER
Error: invalid input: IS NAN can't be used with STRING
Error: invalid input: IS NAN can't be used with STRING


================================================
FILE: src/sql/testscripts/expressions/op_compare_is_null
================================================
# Tests the IS NULL equality operator.

> FALSE IS NULL
> 0 IS NULL
> 0.0 IS NULL
> '' IS NULL
> 'null' IS NULL
> NAN IS NULL
> NULL IS NULL
---
FALSE
FALSE
FALSE
FALSE
FALSE
FALSE
TRUE


================================================
FILE: src/sql/testscripts/expressions/op_compare_lesser
================================================
# Tests the < less than operator.

# Booleans.
> FALSE < TRUE
> TRUE < FALSE
> TRUE < TRUE
> FALSE < FALSE
---
TRUE
FALSE
FALSE
FALSE

# Integers.
> 3 < 2
> 3 < 3
> 3 < 4
> -1 < 0
> 0 < -1
---
FALSE
FALSE
TRUE
TRUE
FALSE

# Floats.
> 3.14 < 3.13
> 3.14 < 3.14
> 3.14 < 3.15
> -0.0 < 0.0
---
FALSE
FALSE
TRUE
FALSE

# Float special values.
> 1e300 < INFINITY
> INFINITY < INFINITY
> -INFINITY < INFINITY
> NAN < NAN
> NAN < INFINITY
> INFINITY < NAN
> 0.0 < NAN
---
TRUE
FALSE
TRUE
FALSE
FALSE
FALSE
FALSE

# Mixed integer/float values.
> 3 < 2.9
> 3 < 3.0
> 3 < 3.1
> -0.0 < 0
---
FALSE
FALSE
TRUE
FALSE

# Strings.
> 'abc' < 'abc'
> 'abb' < 'abc'
> 'ab' < 'abc'
> 'abc' < 'b'
---
FALSE
TRUE
TRUE
TRUE

# Empty strings.
> '' < ''
> '' < 'a'
> 'a' < ''
---
FALSE
TRUE
FALSE

# String case comparisons.
> 'B' < 'a'
> 'B' < 'z'
> 'B' < 'A'
> 'B' < 'Z'
---
TRUE
TRUE
FALSE
TRUE

# Unicode strings.
> '😀' < '🙁' 
> '😀' < '😀' 
> '🙁' < '😀' 
---
TRUE
FALSE
FALSE

# NULLs.
> TRUE < NULL
> NULL < TRUE
> 1 < NULL
> NULL < 1
> 3.14 < NULL
> NULL < 3.14
> '' < NULl
> NULL < ''
> NULL < NULL
> NULL < NAN
---
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL
NULL

# Type conflicts.
!> TRUE < 1
!> TRUE < ''
!> '' < 1
---
Error: invalid input: can't compare TRUE and 1
Error: invalid input: can't compare TRUE and ''
Error: invalid input: can't compare '' and 1


================================================
FILE: src/sql/testscripts/expressions/op_compare_lesser_equal
================================================
# Tests the <= less than or equal operator.

# This is implemented as < OR =, just verify this for a few basic cases.

[expr]> 1 <= 0
[expr]> 0 <= 0
[expr]> -1 <= 0
---
FALSE ← Or(LessThan(Constant(Integer(1)), Constant(Integer(0))), Equal(Constant(Integer(1)), Constant(Integer(0))))
TRUE ← Or(LessThan(Constant(Integer(0)), Constant(Integer(0))), Equal(Constant(Integer(0)), Constant(Integer(0))))
TRUE ← Or(LessThan(Negate(Constant(Integer(1))), Constant(Integer(0))), Equal(Negate(Constant(Integer(1))), Constant(Integer(0))))

[expr]> 0.0 <= -0.0
[expr]> INFINITY <= INFINITY
[expr]> NAN <= NAN
---
TRUE ← Or(LessThan(Constant(Float(0.0)), Negate(Constant(Float(0.0)))), Equal(Constant(Float(0.0)), Negate(Constant(Float(0.0)))))
TRUE ← Or(LessThan(Constant(Float(inf)), Constant(Float(inf))), Equal(Constant(Float(inf)), Constant(Float(inf))))
FALSE ← Or(LessThan(Constant(Float(NaN)), Constant(Float(NaN))), Equal(Constant(Float(NaN)), Constant(Float(NaN))))

[expr]> NULL <= 1
[expr]> NULL <= NAN
[expr]> NULL <= NULL
---
NULL ← Or(LessThan(Constant(Null), Constant(Integer(1))), Equal(Constant(Null), Constant(Integer(1))))
NULL ← Or(LessThan(Constant(Null), Constant(Float(NaN))), Equal(Constant(Null), Constant(Float(NaN))))
NULL ← Or(LessThan(Constant(Null), Constant(Null)), Equal(Constant(Null), Constant(Null)))


================================================
FILE: src/sql/testscripts/expressions/op_compare_not_equal
================================================
# Tests the != inequality operator.

# != is a combination of NOT and =, just verify that for a few basic cases.

[expr]> 1 != 1
[expr]> 1 != 3
[expr]> 1 != NULL
---
FALSE ← Not(Equal(Constant(Integer(1)), Constant(Integer(1))))
TRUE ← Not(Equal(Constant(Integer(1)), Constant(Integer(3))))
NULL ← Not(Equal(Constant(Integer(1)), Constant(Null)))

[expr]> 3.0 != 3
[expr]> 0.0 != -0.0
---
FALSE ← Not(Equal(Constant(Float(3.0)), Constant(Integer(3))))
FALSE ← Not(Equal(Constant(Float(0.0)), Negate(Constant(Float(0.0)))))

[expr]> NAN != NAN
[expr]> INFINITY != INFINITY
[expr]> NULL != NULL
---
TRUE ← Not(Equal(Constant(Float(NaN)), Constant(Float(NaN))))
FALSE ← Not(Equal(Constant(Float(inf)), Constant(Float(inf))))
NULL ← Not(Equal(Constant(Null), Constant(Null)))


================================================
FILE: src/sql/testscripts/expressions/op_logic_and
================================================
# Tests the AND logical operator.

# Basic truth table.
> TRUE AND TRUE
> TRUE AND FALSE
> FALSE AND TRUE
> FALSE AND FALSE
---
TRUE
FALSE
FALSE
FALSE

# Trinary logic.
> TRUE AND NULL
> NULL AND TRUE
> FALSE AND NULL
> NULL AND FALSE
> NULL AND NULL
---
NULL
NULL
FALSE
FALSE
NULL

# Non-booleans.
!> 1 AND TRUE
!> TRUE AND 1
!> 1 AND 1
!> 3.14 AND TRUE
!> TRUE AND 3.14
!> 3.14 AND 3.14
!> 'true' AND TRUE
!> TRUE AND 'true'
!> 'true' AND 'true'
---
Error: invalid input: can't AND 1 and TRUE
Error: invalid input: can't AND TRUE and 1
Error: invalid input: can't AND 1 and 1
Error: invalid input: can't AND 3.14 and TRUE
Error: invalid input: can't AND TRUE and 3.14
Error: invalid input: can't AND 3.14 and 3.14
Error: invalid input: can't AND 'true' and TRUE
Error: invalid input: can't AND TRUE and 'true'
Error: invalid input: can't AND 'true' and 'true'


================================================
FILE: src/sql/testscripts/expressions/op_logic_not
================================================
# Tests the NOT logical operator.

> NOT TRUE
> NOT FALSE
> NOT NULL
---
FALSE
TRUE
NULL

# Non-booleans.
!> NOT 1
!> NOT 3.14
!> NOT 'true'
---
Error: invalid input: can't NOT 1
Error: invalid input: can't NOT 3.14
Error: invalid input: can't NOT 'true'


================================================
FILE: src/sql/testscripts/expressions/op_logic_or
================================================
# Tests the OR logical operator.

# Basic truth table.
> TRUE OR TRUE
> TRUE OR FALSE
> FALSE OR TRUE
> FALSE OR FALSE
---
TRUE
TRUE
TRUE
FALSE

# Trinary logic.
> TRUE OR NULL
> NULL OR TRUE
> FALSE OR NULL
> NULL OR FALSE
> NULL OR NULL
---
TRUE
TRUE
NULL
NULL
NULL

# Non-booleans.
!> 1 OR TRUE
!> TRUE OR 1
!> 1 OR 1
!> 3.14 OR TRUE
!> TRUE OR 3.14
!> 3.14 OR 3.14
!> 'true' OR TRUE
!> TRUE OR 'true'
!> 'true' OR 'true'
---
Error: invalid input: can't OR 1 and TRUE
Error: invalid input: can't OR TRUE and 1
Error: invalid input: can't OR 1 and 1
Error: invalid input: can't OR 3.14 and TRUE
Error: invalid input: can't OR TRUE and 3.14
Error: invalid input: can't OR 3.14 and 3.14
Error: invalid input: can't OR 'true' and TRUE
Error: invalid input: can't OR TRUE and 'true'
Error: invalid input: can't OR 'true' and 'true'


================================================
FILE: src/sql/testscripts/expressions/op_math_add
================================================
# Tests the + addition operator.

# Simple integer addition.
[expr]> 1 + 2
[expr]> 1 + -3
[expr]> 1 + -2 + 3
---
3 ← Add(Constant(Integer(1)), Constant(Integer(2)))
-2 ← Add(Constant(Integer(1)), Negate(Constant(Integer(3))))
2 ← Add(Add(Constant(Integer(1)), Negate(Constant(Integer(2)))), Constant(Integer(3)))

# Simple float addition.
[expr]> 3.1 + 2.71
[expr]> 3.1 + -2.71
---
5.8100000000000005 ← Add(Constant(Float(3.1)), Constant(Float(2.71)))
0.3900000000000001 ← Add(Constant(Float(3.1)), Negate(Constant(Float(2.71))))

# Combined int/float addition yields floats.
> 3.72 + 1
> 1 + 3.72
> 1 + 3.0
> -1 + 3.72
---
4.720000000000001
4.720000000000001
4.0
2.72

# Addition with nulls yields null.
> 1 + NULL
> NULL + 3.14
> NULL + NULL
---
NULL
NULL
NULL

# Addition with infinity and NaN.
> 1 + INFINITY
> 1 + -INFINITY
> -1 + INFINITY
> 1 + NAN
> 3.14 + -NAN
> INFINITY + NAN
---
inf
-inf
inf
NaN
NaN
NaN

# Overflow and underflow.
!> 9223372036854775807 + 1
!> -9223372036854775807 + -2
> 9223372036854775807 + 1.0
> 2e308 + 2e308
---
Error: invalid input: integer overflow
Error: invalid input: integer overflow
9.223372036854776e18
inf

# Bools and strings error.
!> TRUE + FALSE
!> 'a' + 'b'
---
Error: invalid input: can't add TRUE and FALSE
Error: invalid input: can't add 'a' and 'b'


================================================
FILE: src/sql/testscripts/expressions/op_math_divide
================================================
# Tests the / division operator.

# Integers.
[expr]> 9 / 3
[expr]> 8 / 3
[expr]> 8 / -3
---
3 ← Divide(Constant(Integer(9)), Constant(Integer(3)))
2 ← Divide(Constant(Integer(8)), Constant(Integer(3)))
-2 ← Divide(Constant(Integer(8)), Negate(Constant(Integer(3))))

# Floats.
[expr]> 4.16 / 3.2
[expr]> 4.16 / -3.2
---
1.3 ← Divide(Constant(Float(4.16)), Constant(Float(3.2)))
-1.3 ← Divide(Constant(Float(4.16)), Negate(Constant(Float(3.2))))

# Mixed always yields floats.
> 3 / 1.2
> 1.2 / 3
> 9.0 / 3
> 0.0 / 1
---
2.5
0.39999999999999997
3.0
0.0

# Division by zero errors for integers, yields infinity or nan for floats.
!> 1 / 0
!> 0 / 0
!> -1 / 0
> 1.0 / 0.0
> 0.0 / 0.0
> -1.0 / 0.0
> 1.0 / -0.0
---
Error: invalid input: can't divide by zero
Error: invalid input: can't divide by zero
Error: invalid input: can't divide by zero
inf
NaN
-inf
-inf

# Division with NULL always yields NULL.
> 1 / NULL
> NULL / 1
> 1.0 / NULL
> NULL / 1.0
> NULL / NULL
> NULL / 0
---
NULL
NULL
NULL
NULL
NULL
NULL

# Division by infinity.
> 3.14 / INFINITY
> 3.14 / -INFINITY
> -3.14 / INFINITY
> INFINITY / 10
> 0 / INFINITY
> INFINITY / 0.0
> INFINITY / INFINITY
> -INFINITY / -INFINITY
---
0.0
-0.0
-0.0
inf
0.0
inf
NaN
NaN

# Division by NaN.
> 1 / NAN
> NAN / 1
> NAN / NAN
> NAN / 0
---
NaN
NaN
NaN
NaN

# Bools and strings error.
!> TRUE / FALSE
!> 'a' / 'b'
---
Error: invalid input: can't divide TRUE and FALSE
Error: invalid input: can't divide 'a' and 'b'


================================================
FILE: src/sql/testscripts/expressions/op_math_exponentiate
================================================
# Tests the ^ exponentiation operator.

# Integers.
[expr]> 2 ^ 3
[expr]> 2 ^ 0
[expr]> 0 ^ 2
[expr]> 9 ^ -3
---
8 ← Exponentiate(Constant(Integer(2)), Constant(Integer(3)))
1 ← Exponentiate(Constant(Integer(2)), Constant(Integer(0)))
0 ← Exponentiate(Constant(Integer(0)), Constant(Integer(2)))
0.0013717421124828531 ← Exponentiate(Constant(Integer(9)), Negate(Constant(Integer(3))))

# Floats.
[expr]> 6.25 ^ 0.5
[expr]> 6.25 ^ 3.14
---
2.5 ← Exponentiate(Constant(Float(6.25)), Constant(Float(0.5)))
315.5464179407336 ← Exponentiate(Constant(Float(6.25)), Constant(Float(3.14)))

# Mixed.
> 6.25 ^ 2
> 9 ^ 0.5
---
39.0625
3.0

# Overflow and underflow.
!> 2 ^ 10000000000
!> 9223372036854775807 ^ 2
> 10e200 ^ 2
---
Error: invalid input: integer overflow
Error: invalid input: integer overflow
inf

# Nulls.
> 1 ^ NULL
> 3.14 ^ NULL
> NULL ^ 2
> NULL ^ 3.14
> NULL ^ NULL
---
NULL
NULL
NULL
NULL
NULL

# Infinity and NaN.
> 2 ^ INFINITY
> INFINITY ^ 2
> INFINITY ^ INFINITY
> 2 ^ -INFINITY
> 2 ^ NAN
> NAN ^ 2
> NAN ^ NAN
---
inf
inf
inf
0.0
NaN
NaN
NaN

# Bools and strings.
!> TRUE ^ FALSE
!> 'a' ^ 'b'
---
Error: invalid input: can't exponentiate TRUE and FALSE
Error: invalid input: can't exponentiate 'a' and 'b'

# Right-associativity.
[expr]> 2 ^ 3 ^ 2
[expr]> 2 ^ 1 ^ 2 ^ 3
---
512 ← Exponentiate(Constant(Integer(2)), Exponentiate(Constant(Integer(3)), Constant(Integer(2))))
2 ← Exponentiate(Constant(Integer(2)), Exponentiate(Constant(Integer(1)), Exponentiate(Constant(Integer(2)), Constant(Integer(3)))))


================================================
FILE: src/sql/testscripts/expressions/op_math_factorial
================================================
# Tests the ! factorial suffix operator.

# Integer works.
[expr]> 3!
---
6 ← Factorial(Constant(Integer(3)))

# But float, bool, and string fails.
!> 3.14!
!> 3.0!
!> TRUE!
!> 'a'!
---
Error: invalid input: can't take factorial of 3.14
Error: invalid input: can't take factorial of 3.0
Error: invalid input: can't take factorial of TRUE
Error: invalid input: can't take factorial of 'a'

# 0 factorial is 1, but negative factorial errors.
> -0!
!> -1!
---
1
Error: invalid input: can't take factorial of -1

# NULL yields null, infinity and NaN error.
> NULL!
!> INFINITY!
!> NAN!
---
NULL
Error: invalid input: can't take factorial of inf
Error: invalid input: can't take factorial of NaN

# Multiple applications work.
[expr]> 3!!
[expr]> 2!!!!!!
---
720 ← Factorial(Factorial(Constant(Integer(3))))
2 ← Factorial(Factorial(Factorial(Factorial(Factorial(Factorial(Constant(Integer(2))))))))

# Overflow.
[expr]!> 3!!!
---
Error: invalid input: integer overflow


================================================
FILE: src/sql/testscripts/expressions/op_math_identity
================================================
# Tests the + identity prefix operator.

# Integer and float works.
[expr]> +1
[expr]> +3.14
---
1 ← Identity(Constant(Integer(1)))
3.14 ← Identity(Constant(Float(3.14)))

# NULL, infinity and NaN.
> +NULL
> +INFINITY
> +NAN
---
NULL
inf
NaN

# Multiple applications work.
[expr]> +++1
---
1 ← Identity(Identity(Identity(Constant(Integer(1)))))

# Bool and string fails.
!> +TRUE
!> +'a'
---
Error: invalid input: can't take the identity of TRUE
Error: invalid input: can't take the identity of 'a'


================================================
FILE: src/sql/testscripts/expressions/op_math_multiply
================================================
# Tests the * multiplication operator.

# Integers.
[expr]> 2 * 3
[expr]> 2 * -3
---
6 ← Multiply(Constant(Integer(2)), Constant(Integer(3)))
-6 ← Multiply(Constant(Integer(2)), Negate(Constant(Integer(3))))

# Float.
[expr]> 3.14 * 2.71
[expr]> 3.14 * -2.71
---
8.5094 ← Multiply(Constant(Float(3.14)), Constant(Float(2.71)))
-8.5094 ← Multiply(Constant(Float(3.14)), Negate(Constant(Float(2.71))))

# Mixed.
> 3.14 * 2
> -2 * 3.14
---
6.28
-6.28

# Integer and float overflow, underflow, and precision loss.
!> 9223372036854775807 * 2
!> 9223372036854775807 * -2
> 2e308 * 2
> 9223372036854775807 * 2.0
---
Error: invalid input: integer overflow
Error: invalid input: integer overflow
inf
1.8446744073709552e19


# NULLs always yield NULL.
> 1 * NULL
> NULL * 3.14
> NULL * NULL
---
NULL
NULL
NULL

# Infinity.
> 2 * INFINITY
> -2 * INFINITY
> 3.14 * -INFINITY
> INFINITY * INFINITY
> INFINITY * -INFINITY
---
inf
-inf
-inf
inf
-inf

# NaN.
> 2 * NAN
> -3.14 * NAN
> INFINITY * NAN
> NAN * NAN
---
NaN
NaN
NaN
NaN

# Bools and strings.
!> TRUE * FALSE
!> 'a' * 'b'
---
Error: invalid input: can't multiply TRUE and FALSE
Error: invalid input: can't multiply 'a' and 'b'


================================================
FILE: src/sql/testscripts/expressions/op_math_negate
================================================
# Tests the - negation prefix operator.

# Integer and float works.
[expr]> -1
[expr]> -3.14
---
-1 ← Negate(Constant(Integer(1)))
-3.14 ← Negate(Constant(Float(3.14)))

# NULL, infinity and NaN.
> -NULL
> -INFINITY
> -NAN
---
NULL
-inf
NaN

# Multiple applications work.
[expr]> ---1
[expr]> ----1
---
-1 ← Negate(Negate(Negate(Constant(Integer(1)))))
1 ← Negate(Negate(Negate(Negate(Constant(Integer(1))))))

# Bool and string fails.
!> -TRUE
!> -'a'
---
Error: invalid input: can't negate TRUE
Error: invalid input: can't negate 'a'


================================================
FILE: src/sql/testscripts/expressions/op_math_remainder
================================================
# Tests the % remainder operator.
#
# Note that remainder is not the same as modulo: the former has the sign of the
# dividend, while the latter always has a non-negative value.

# Integers.
[expr]> 5 % 3
[expr]> -5 % 3
[expr]> 5 % -3
---
2 ← Remainder(Constant(Integer(5)), Constant(Integer(3)))
-2 ← Remainder(Negate(Constant(Integer(5))), Constant(Integer(3)))
2 ← Remainder(Constant(Integer(5)), Negate(Constant(Integer(3))))

# Floats.
[expr]> 6.28 % 2.2
[expr]> 6.28 % -2.2
---
1.88 ← Remainder(Constant(Float(6.28)), Constant(Float(2.2)))
1.88 ← Remainder(Constant(Float(6.28)), Negate(Constant(Float(2.2))))

# Mixed.
> 3.15 % 2
> 6 % 3.15
> 3.15 % -2
---
1.15
2.85
1.15

# Division by zero.
!> 7 % 0
> 6.28 % 0.0
---
Error: invalid input: can't divide by zero
NaN

# NULLs.
> 1 % NULL
> NULL % 3
> 3.14 % NULL
> NULL % NULL
---
NULL
NULL
NULL
NULL

# Infinity and NaN.
> INFINITY % 7
> 7 % INFINITY
> 7 % -INFINITY
> INFINITY % INFINITY
> 7 % NAN
> NAN % 7
> NAN % NAN
---
NaN
7.0
7.0
NaN
NaN
NaN
NaN

# Bools and strings.
!> TRUE % FALSE
!> 'a' % 'b'
---
Error: invalid input: can't take remainder of TRUE and FALSE
Error: invalid input: can't take remainder of 'a' and 'b'


================================================
FILE: src/sql/testscripts/expressions/op_math_subtract
================================================
# Tests the - subtraction operator.

# Simple integer subtraction.
[expr]> 2 - 1
[expr]> 2 - 3
[expr]> 1 - -3 - 2
---
1 ← Subtract(Constant(Integer(2)), Constant(Integer(1)))
-1 ← Subtract(Constant(Integer(2)), Constant(Integer(3)))
2 ← Subtract(Subtract(Constant(Integer(1)), Negate(Constant(Integer(3)))), Constant(Integer(2)))

# Simple float subtraction.
[expr]> 3.1 - 2.71
[expr]> 3.1 - -2.71
---
0.3900000000000001 ← Subtract(Constant(Float(3.1)), Constant(Float(2.71)))
5.8100000000000005 ← Subtract(Constant(Float(3.1)), Negate(Constant(Float(2.71))))

# Combined int/float subtraction yields floats.
> 3.72 - 1
> 1 - 3.72
> 1 - 3.0
> -1 - 3.72
---
2.72
-2.72
-2.0
-4.720000000000001

# Subtraction with nulls yields null.
> 1 - NULL
> NULL - 3.14
> NULL - NULL
---
NULL
NULL
NULL

# Subtraction with infinity and NaN.
> 1 - INFINITY
> -1 - INFINITY
> -1 - -INFINITY
> 1 - NAN
> 3.14 - -NAN
> INFINITY - NAN
---
-inf
-inf
inf
NaN
NaN
NaN

# Overflow and underflow.
!> 9223372036854775807 - -1
!> -9223372036854775807 - 2
> 9223372036854775807 - -1.0
> -2e308 - 2e308
---
Error: invalid input: integer overflow
Error: invalid input: integer overflow
9.223372036854776e18
-inf

# Bools and strings error.
!> TRUE - FALSE
!> 'a' - 'b'
---
Error: invalid input: can't subtract TRUE and FALSE
Error: invalid input: can't subtract 'a' and 'b'

# Left-associativity.
> 5 - 3 - 1
---
1


================================================
FILE: src/sql/testscripts/expressions/op_precedence
================================================
# Tests operator precedence. Test each precedence level against the operators
# beside and immediately below it, in order. The levels are:
#
# 10: prefix +, -
# 9: postfix !
# 8: ^ (right-associative)
# 7: *, /, %
# 6: +, -
# 5: >, >=, <, <=
# 4: =, !=, LIKE, IS
# 3: NOT
# 2: AND
# 1: OR
#
# Only ^ is right-associative (and prefix operators by definition).

# Parentheses can boost a low precedence operator (e.g. addition) above the
# highest precedence (e.g. prefix/postfix and ^).
> 1 + 2 ^ 2
> (1 + 2) ^ 2
> -1 + 2
> -(1 + 2)
> 2 + 3!
> (2 + 3)!
---
5
9
1
-3
8
120

# Prefix -.
> -3 ^ 2
> -(3 ^ 2)
---
9
-9

# Postfix !.
> 2 ^ 3!
> (2 ^ 3)!
---
64
40320

# ^, which is also right-associative.
> 2 ^ 3 ^ 2
> (2 ^ 3) ^ 2
> 2 ^ 3 * 4
> 2 ^ (3 * 4)
> 2 ^ 4 / 2
> 2 ^ (4 / 2)
> 2 ^ 5 % 2
> 2 ^ (5 % 2)
---
512
64
32
4096
8
4
0
2

# *
> 3 * 4 / 2
> 3 * (4 / 2)
> 3 * 4 % 3
> 3 * (4 % 3)
> 1 + 2 * 3
> (1 + 2) * 3
> 1 - 2 * 3
> (1 - 2) * 3
---
6
6
0
3
7
9
-5
-3

# /
> 4 / 2 * 3
> 4 / (2 * 3)
> 8 / 4 % 3
> 8 / (4 % 3)
> 2 + 4 / 2
> (2 + 4) / 2
> 4 - 2 / 2
> (4 - 2) / 2
---
6
0
2
8
4
3
3
1

# %
> 4 % 3 * 3
> 4 % (3 * 3)
> 8 % 3 / 2
> 8 % (3 / 2)
> 2 + 4 % 3
> (2 + 4) % 3
> 8 - 5 % 3
> (8 - 5) % 3
---
3
4
1
0
3
0
6
0

# +
> 1 + 2 - 3
> 1 + (2 - 3)
> 1 + 2 > 2
!> 1 + (2 > 2)
> 1 + 2 >= 2
!> 1 + (2 >= 2)
> 1 + 2 < 2
!> 1 + (2 < 2)
> 1 + 2 <= 2
!> 1 + (2 <= 2)
---
0
0
TRUE
Error: invalid input: can't add 1 and FALSE
TRUE
Error: invalid input: can't add 1 and TRUE
FALSE
Error: invalid input: can't add 1 and FALSE
FALSE
Error: invalid input: can't add 1 and TRUE

# -
> 3 - 2 + 1
> 3 - (2 + 1)
> 2 - 1 > 2
!> 2 - (1 > 2)
> 2 - 1 >= 2
!> 2 - (1 >= 2)
> 2 - 1 < 2
!> 2 - (1 < 2)
> 2 - 1 <= 2
!> 2 - (1 <= 2)
---
2
0
FALSE
Error: invalid input: can't subtract 2 and FALSE
FALSE
Error: invalid input: can't subtract 2 and FALSE
TRUE
Error: invalid input: can't subtract 2 and TRUE
TRUE
Error: invalid input: can't subtract 2 and TRUE

# >
> 5 > 3 < TRUE
!> 5 > (3 < TRUE)
> 5 > 3 <= TRUE
!> 5 > (3 <= TRUE)
> 5 > 3 > TRUE
!> 5 > (3 > TRUE)
> 5 > 3 >= TRUE
!> 5 > (3 >= TRUE)
> 5 > 3 = TRUE
!> 5 > (3 = TRUE)
> 5 > 3 != TRUE
!> 5 > (3 != TRUE)
!> 5 > 3 LIKE 'abc'
!> 5 > (3 LIKE 'abc')
> 5 > 3 IS NULL
!> 5 > (3 IS NULL)
---
FALSE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
Error: invalid input: can't LIKE TRUE and 'abc'
Error: invalid input: can't LIKE 3 and 'abc'
FALSE
Error: invalid input: can't compare 5 and FALSE

# >=
> 5 >= 3 < TRUE
!> 5 >= (3 < TRUE)
> 5 >= 3 <= TRUE
!> 5 >= (3 <= TRUE)
> 5 >= 3 > TRUE
!> 5 >= (3 > TRUE)
> 5 >= 3 >= TRUE
!> 5 >= (3 >= TRUE)
> 5 >= 3 = TRUE
!> 5 >= (3 = TRUE)
> 5 >= 3 != TRUE
!> 5 >= (3 != TRUE)
!> 5 >= 3 LIKE 'abc'
!> 5 >= (3 LIKE 'abc')
> 5 >= 3 IS NULL
!> 5 >= (3 IS NULL)
---
FALSE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
Error: invalid input: can't LIKE TRUE and 'abc'
Error: invalid input: can't LIKE 3 and 'abc'
FALSE
Error: invalid input: can't compare 5 and FALSE

# <
> 5 < 3 < TRUE
!> 5 < (3 < TRUE)
> 5 < 3 <= TRUE
!> 5 < (3 <= TRUE)
> 5 < 3 > TRUE
!> 5 < (3 > TRUE)
> 5 < 3 >= TRUE
!> 5 < (3 >= TRUE)
> 5 < 3 = TRUE
!> 5 < (3 = TRUE)
> 5 < 3 != TRUE
!> 5 < (3 != TRUE)
!> 5 < 3 LIKE 'abc'
!> 5 < (3 LIKE 'abc')
> 5 < 3 IS NULL
!> 5 < (3 IS NULL)
---
TRUE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
Error: invalid input: can't LIKE FALSE and 'abc'
Error: invalid input: can't LIKE 3 and 'abc'
FALSE
Error: invalid input: can't compare 5 and FALSE

# <=
> 5 <= 3 < TRUE
!> 5 <= (3 < TRUE)
> 5 <= 3 <= TRUE
!> 5 <= (3 <= TRUE)
> 5 <= 3 > TRUE
!> 5 <= (3 > TRUE)
> 5 <= 3 >= TRUE
!> 5 <= (3 >= TRUE)
> 5 <= 3 = TRUE
!> 5 <= (3 = TRUE)
> 5 <= 3 != TRUE
!> 5 <= (3 != TRUE)
!> 5 <= 3 LIKE 'abc'
!> 5 <= (3 LIKE 'abc')
> 5 <= 3 IS NULL
!> 5 <= (3 IS NULL)
---
TRUE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
FALSE
Error: invalid input: can't compare 3 and TRUE
TRUE
Error: invalid input: can't compare 3 and TRUE
Error: invalid input: can't LIKE FALSE and 'abc'
Error: invalid input: can't LIKE 3 and 'abc'
FALSE
Error: invalid input: can't compare 5 and FALSE

# =
> 1 = 1 != FALSE
!> 1 = (1 != FALSE)
!> 1 = 1 LIKE 'abc'
!> 1 = (1 LIKE 'abc')
> 1 = NULL IS NULL
!> 1 = (NULL IS NULL)
> NOT 1 = 1
!> (NOT 1) = 1
---
TRUE
Error: invalid input: can't compare 1 and FALSE
Error: invalid input: can't LIKE TRUE and 'abc'
Error: invalid input: can't LIKE 1 and 'abc'
TRUE
Error: invalid input: can't compare 1 and TRUE
FALSE
Error: invalid input: can't NOT 1

# !=
> 1 != 1 != FALSE
!> 1 != (1 != FALSE)
!> 1 != 1 LIKE 'abc'
!> 1 != (1 LIKE 'abc')
> 1 != NULL IS NULL
!> 1 != (NULL IS NULL)
> NOT 1 != 1
!> (NOT 1) != 1
---
FALSE
Error: invalid input: can't compare 1 and FALSE
Error: invalid input: can't LIKE FALSE and 'abc'
Error: invalid input: can't LIKE 1 and 'abc'
TRUE
Error: invalid input: can't compare 1 and TRUE
TRUE
Error: invalid input: can't NOT 1

# LIKE
> 'abc' LIKE NULL IS NULL
!> 'abc' LIKE (NULL IS NULL)
> NOT 'abc' LIKE 'abc'
!> (NOT 'abc') LIKE 'abc'
---
TRUE
Error: invalid input: can't LIKE 'abc' and TRUE
FALSE
Error: invalid input: can't NOT 'abc'

# IS NULL
> NOT NULL IS NULL
> (NOT NULL) IS NULL
---
FALSE
TRUE

# IS NOT NULL
> NOT NULL IS NOT NULL
> (NOT NULL) IS NOT NULL
---
TRUE
FALSE

# IS NAN
> NOT NAN IS NAN
!> (NOT NAN) IS NAN
---
FALSE
Error: invalid input: can't NOT NaN

# IS NOT NAN
> NOT NAN IS NOT NAN
!> (NOT NAN) IS NOT NAN
---
TRUE
Error: invalid input: can't NOT NaN

# NOT.
> NOT TRUE AND FALSE
> NOT (TRUE AND FALSE)
---
FALSE
TRUE

# AND
> FALSE AND TRUE OR TRUE
> FALSE AND (TRUE OR TRUE)
> TRUE OR TRUE AND FALSE
> (TRUE OR TRUE) AND FALSE
---
TRUE
FALSE
TRUE
FALSE

# OR has the lowest precedence, so nothing to test.


================================================
FILE: src/sql/testscripts/expressions/op_string_like
================================================
# Tests the LIKE string pattern matching operator.

# Multi-character matches.
> 'abcde' LIKE 'a%e'
> 'abcde' LIKE 'abc%'
> 'abcde' LIKE '%cde'
> 'abcde' LIKE '%'
---
TRUE
TRUE
TRUE
TRUE

# Multi-character mismatches.
> 'abcde' LIKE 'a%f'
> 'abcde' LIKE 'b%e'
> 'abcde' LIKE 'b%'
> 'abcde' LIKE '%d'
---
FALSE
FALSE
FALSE
FALSE

# Multi-character wildcards match 0 characters.
> 'abcde' LIKE 'abc%de'
> 'abcde' LIKE '%abcde'
> 'abcde' LIKE 'abcde%'
> '' LIKE '%'
---
TRUE
TRUE
TRUE
TRUE

# Single-character matches.
> 'abcde' LIKE 'ab_de'
> 'abcde' LIKE '_bcde'
> 'abcde' LIKE 'abcd_'
---
TRUE
TRUE
TRUE

# Single-character mismatches.
> 'abcde' LIKE 'ab_e'
> 'abcde' LIKE 'abc_'
> 'abcde' LIKE '_bcd'
---
FALSE
FALSE
FALSE

# Single-character wildcards require at least one match.
> 'abcde' LIKE 'abc_de'
> 'abcde' LIKE '_abcde'
> 'abcde' LIKE 'abcde_'
> '' LIKE '_'
---
FALSE
FALSE
FALSE
FALSE

# Exact matches. Submatches are not sufficient.
> 'abcde' LIKE 'abcde'
> 'abcde' LIKE 'abc'
> 'abcde' LIKE 'abcdef'
---
TRUE
FALSE
FALSE

# Patterns are case-sensitive.
> 'abcde' LIKE 'ABCDE'
> 'abcde' LIKE 'A%'
---
FALSE
FALSE

# Wildcards can be mixed and used multiple times, and % can match nothing.
> 'abcde' LIKE 'a%c%e'
> 'abcde' LIKE '%%e'
> 'abcde' LIKE '%%abcde'
> 'abcde' LIKE 'a___e'
> 'abcdefghijklmno' LIKE 'a_c%f%i_kl%m_o'
---
TRUE
TRUE
TRUE
TRUE
TRUE

# NULLs.
> NULL LIKE '%'
> NULL LIKE '_'
> 'abc' LIKE NULL
> NULL LIKE NULL
---
NULL
NULL
NULL
NULL

# * and ? are not valid patterns.
> 'abcde' LIKE 'a*e'
> 'abcde' LIKE 'ab?de'
---
FALSE
FALSE

# Fails with non-strings.
!> 'abc' LIKE 1
!> 1 LIKE 'abc'
!> 'abc' LIKE 3.14
!> 3.14 LIKE 'abc'
!> 'abc' LIKE TRUE
!> TRUE LIKE 'abc'
---
Error: invalid input: can't LIKE 'abc' and 1
Error: invalid input: can't LIKE 1 and 'abc'
Error: invalid input: can't LIKE 'abc' and 3.14
Error: invalid input: can't LIKE 3.14 and 'abc'
Error: invalid input: can't LIKE 'abc' and TRUE
Error: invalid input: can't LIKE TRUE and 'abc'


================================================
FILE: src/sql/testscripts/optimizers/constant_folder
================================================
# Tests the constant folding optimizer.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')
---
ok

# Constant folding is applied in all places where expressions are used.
[opt]> SELECT 1+1
---
Initial:
   Projection: 1 + 1
   └─ Values: blank row
ConstantFolding:
   Projection: 2
   └─ Values: blank row
2

[opt]> SELECT 1+1 FROM test
---
Initial:
   Projection: 1 + 1
   └─ Scan: test
ConstantFolding:
   Projection: 2
   └─ Scan: test
2
2
2

[opt]> SELECT * FROM test a JOIN test b ON 1+1 > 1
---
Initial:
   NestedLoopJoin: inner on 1 + 1 > 1
   ├─ Scan: test as a
   └─ Scan: test as b
ConstantFolding:
   NestedLoopJoin: inner on TRUE
   ├─ Scan: test as a
   └─ Scan: test as b
FilterPushdown:
   NestedLoopJoin: inner
   ├─ Scan: test as a (TRUE)
   └─ Scan: test as b (TRUE)
ShortCircuit:
   NestedLoopJoin: inner
   ├─ Scan: test as a
   └─ Scan: test as b
1, 'a', 1, 'a'
1, 'a', 2, 'b'
1, 'a', 3, 'c'
2, 'b', 1, 'a'
2, 'b', 2, 'b'
2, 'b', 3, 'c'
3, 'c', 1, 'a'
3, 'c', 2, 'b'
3, 'c', 3, 'c'

[opt]> SELECT * FROM test WHERE 1+1 > 1
---
Initial:
   Filter: 1 + 1 > 1
   └─ Scan: test
ConstantFolding:
   Filter: TRUE
   └─ Scan: test
FilterPushdown:
   Scan: test (TRUE)
ShortCircuit:
   Scan: test
1, 'a'
2, 'b'
3, 'c'

[opt]> SELECT * FROM test ORDER BY 1+1
---
Initial:
   Order: 1 + 1 asc
   └─ Scan: test
ConstantFolding:
   Order: 2 asc
   └─ Scan: test
1, 'a'
2, 'b'
3, 'c'

[opt]> SELECT * FROM test LIMIT 1+1
---
Initial:
   Limit: 2
   └─ Scan: test
1, 'a'
2, 'b'

[opt]> SELECT * FROM test OFFSET 1+1
---
Initial:
   Offset: 2
   └─ Scan: test
3, 'c'

# Constant folding folds the constant parts of a variable expression.
# TODO: this should fold 4 - 6, but it needs to reorder operations.
[opt]> SELECT 2 * 2 + id - 3 * 2 FROM test
---
Initial:
   Projection: 2 * 2 + test.id - 3 * 2
   └─ Scan: test
ConstantFolding:
   Projection: 4 + test.id - 6
   └─ Scan: test
-1
0
1

# Constant folding short-circuits variable logical operations.
[opt]> SELECT * FROM test WHERE 1+1 > 1 OR id > 1
---
Initial:
   Filter: 1 + 1 > 1 OR test.id > 1
   └─ Scan: test
ConstantFolding:
   Filter: TRUE
   └─ Scan: test
FilterPushdown:
   Scan: test (TRUE)
ShortCircuit:
   Scan: test
1, 'a'
2, 'b'
3, 'c'

[opt]> SELECT * FROM test WHERE 1+1 < 1 OR id > 1
---
Initial:
   Filter: 1 + 1 < 1 OR test.id > 1
   └─ Scan: test
ConstantFolding:
   Filter: test.id > 1
   └─ Scan: test
FilterPushdown:
   Scan: test (test.id > 1)
2, 'b'
3, 'c'

[opt]> SELECT * FROM test WHERE 1+1 > 1 AND id > 1
---
Initial:
   Filter: 1 + 1 > 1 AND test.id > 1
   └─ Scan: test
ConstantFolding:
   Filter: test.id > 1
   └─ Scan: test
FilterPushdown:
   Scan: test (test.id > 1)
2, 'b'
3, 'c'

[opt]> SELECT * FROM test WHERE 1+1 < 1 AND id > 1
---
Initial:
   Filter: 1 + 1 < 1 AND test.id > 1
   └─ Scan: test
ConstantFolding:
   Filter: FALSE
   └─ Scan: test
FilterPushdown:
   Scan: test (FALSE)
ShortCircuit:
   Nothing


================================================
FILE: src/sql/testscripts/optimizers/filter_pushdown
================================================
# Tests filter pushdown.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')
---
ok

# Pushes WHERE filters into Scan nodes.
[opt]> SELECT * FROM test WHERE value = 'b'
---
Initial:
   Filter: test.value = 'b'
   └─ Scan: test
FilterPushdown:
   Scan: test (test.value = 'b')
2, 'b'

# HAVING filters are not pushed past aggregate nodes, even when possible. The
# optimizer could do this if it was smarter.
[opt]> SELECT id, value FROM test GROUP BY id, value HAVING value = 'b'
---
Initial:
   Filter: test.value = 'b'
   └─ Projection: test.id, test.value
      └─ Aggregate: test.id, test.value
         └─ Scan: test
ShortCircuit:
   Filter: test.value = 'b'
   └─ Aggregate: test.id, test.value
      └─ Scan: test
2, 'b'

# Pushes down independent predicates from JOIN nodes.
[opt]> SELECT * FROM test a JOIN test b ON a.value = 'a' AND b.value = 'b'
---
Initial:
   NestedLoopJoin: inner on a.value = 'a' AND b.value = 'b'
   ├─ Scan: test as a
   └─ Scan: test as b
FilterPushdown:
   NestedLoopJoin: inner
   ├─ Scan: test as a (a.value = 'a')
   └─ Scan: test as b (b.value = 'b')
1, 'a', 2, 'b'

# Pushes down independent predicates from JOIN nodes, even when there
# are also dependent predicates.
[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id AND a.value = 'a' AND b.value = 'b'
---
Initial:
   NestedLoopJoin: inner on a.id = b.id AND a.value = 'a' AND b.value = 'b'
   ├─ Scan: test as a
   └─ Scan: test as b
FilterPushdown:
   NestedLoopJoin: inner on a.id = b.id
   ├─ Scan: test as a (a.value = 'a')
   └─ Scan: test as b (b.value = 'b')
HashJoin:
   HashJoin: inner on a.id = b.id
   ├─ Scan: test as a (a.value = 'a')
   └─ Scan: test as b (b.value = 'b')

# Does not push down JOIN node OR predicates.
[opt]> SELECT * FROM test a JOIN test b ON a.value = 'a' OR b.value = 'b'
---
Initial:
   NestedLoopJoin: inner on a.value = 'a' OR b.value = 'b'
   ├─ Scan: test as a
   └─ Scan: test as b
1, 'a', 1, 'a'
1, 'a', 2, 'b'
1, 'a', 3, 'c'
2, 'b', 2, 'b'
3, 'c', 2, 'b'

# Pushes WHERE predicates down into and past JOIN nodes.
[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id WHERE a.value = 'a' AND b.value = 'b'
---
Initial:
   Filter: a.value = 'a' AND b.value = 'b'
   └─ NestedLoopJoin: inner on a.id = b.id
      ├─ Scan: test as a
      └─ Scan: test as b
FilterPushdown:
   NestedLoopJoin: inner on a.id = b.id
   ├─ Scan: test as a (a.value = 'a')
   └─ Scan: test as b (b.value = 'b')
HashJoin:
   HashJoin: inner on a.id = b.id
   ├─ Scan: test as a (a.value = 'a')
   └─ Scan: test as b (b.value = 'b')

# Pushes down the parts of predicates that can be pushed.
[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id WHERE a.value = 'a' AND b.value = 'b' AND (a.id > 0 OR b.id > 0)
---
Initial:
   Filter: a.value = 'a' AND b.value = 'b' AND (a.id > 0 OR b.id > 0)
   └─ NestedLoopJoin: inner on a.id = b.id
      ├─ Scan: test as a
      └─ Scan: test as b
FilterPushdown:
   NestedLoopJoin: inner on (a.id > 0 OR b.id > 0) AND a.id = b.id
   ├─ Scan: test as a (a.value = 'a')
   └─ Scan: test as b (b.value = 'b')

# Equijoin pushdowns can transfer lookups from one relation to the other to make
# use of indexes.
[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id WHERE (a.id = 1 OR a.id = 2)
---
Initial:
   Filter: a.id = 1 OR a.id = 2
   └─ NestedLoopJoin: inner on a.id = b.id
      ├─ Scan: test as a
      └─ Scan: test as b
FilterPushdown:
   NestedLoopJoin: inner on a.id = b.id
   ├─ Scan: test as a (a.id = 1 OR a.id = 2)
   └─ Scan: test as b (b.id = 1 OR b.id = 2)
IndexLookup:
   NestedLoopJoin: inner on a.id = b.id
   ├─ KeyLookup: test as a (1, 2)
   └─ KeyLookup: test as b (1, 2)
HashJoin:
   HashJoin: inner on a.id = b.id
   ├─ KeyLookup: test as a (1, 2)
   └─ KeyLookup: test as b (1, 2)
1, 'a', 1, 'a'
2, 'b', 2, 'b'

[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id WHERE (b.id = 1 OR b.id = 2)
---
Initial:
   Filter: b.id = 1 OR b.id = 2
   └─ NestedLoopJoin: inner on a.id = b.id
      ├─ Scan: test as a
      └─ Scan: test as b
FilterPushdown:
   NestedLoopJoin: inner on a.id = b.id
   ├─ Scan: test as a (a.id = 1 OR a.id = 2)
   └─ Scan: test as b (b.id = 1 OR b.id = 2)
IndexLookup:
   NestedLoopJoin: inner on a.id = b.id
   ├─ KeyLookup: test as a (1, 2)
   └─ KeyLookup: test as b (1, 2)
HashJoin:
   HashJoin: inner on a.id = b.id
   ├─ KeyLookup: test as a (1, 2)
   └─ KeyLookup: test as b (1, 2)
1, 'a', 1, 'a'
2, 'b', 2, 'b'

# Pushdowns can propagate through multiple JOIN nodes.
[opt]> SELECT * FROM \
    test a JOIN test b ON a.id = b.id JOIN test c ON b.id = c.id JOIN test d ON c.id = d.id \
    WHERE a.id > 0 AND b.id = 2 AND c.id < 3 AND (d.id = 2 OR d.id = 3)
---
Initial:
   Filter: a.id > 0 AND b.id = 2 AND c.id < 3 AND (d.id = 2 OR d.id = 3)
   └─ NestedLoopJoin: inner on c.id = d.id
      ├─ NestedLoopJoin: inner on b.id = c.id
      │  ├─ NestedLoopJoin: inner on a.id = b.id
      │  │  ├─ Scan: test as a
      │  │  └─ Scan: test as b
      │  └─ Scan: test as c
      └─ Scan: test as d
FilterPushdown:
   NestedLoopJoin: inner on c.id = d.id
   ├─ NestedLoopJoin: inner on b.id = c.id
   │  ├─ NestedLoopJoin: inner on a.id = b.id
   │  │  ├─ Scan: test as a (a.id > 0 AND (a.id = 2 OR a.id = 3))
   │  │  └─ Scan: test as b (b.id = 2 AND (b.id = 2 OR b.id = 3))
   │  └─ Scan: test as c (c.id < 3 AND (c.id = 2 OR c.id = 3) AND c.id = 2)
   └─ Scan: test as d (d.id = 2 OR d.id = 3)
IndexLookup:
   NestedLoopJoin: inner on c.id = d.id
   ├─ NestedLoopJoin: inner on b.id = c.id
   │  ├─ NestedLoopJoin: inner on a.id = b.id
   │  │  ├─ Filter: a.id > 0
   │  │  │  └─ KeyLookup: test as a (2, 3)
   │  │  └─ Filter: b.id = 2 OR b.id = 3
   │  │     └─ KeyLookup: test as b (2)
   │  └─ Filter: c.id < 3 AND c.id = 2
   │     └─ KeyLookup: test as c (2, 3)
   └─ KeyLookup: test as d (2, 3)
HashJoin:
   HashJoin: inner on c.id = d.id
   ├─ HashJoin: inner on b.id = c.id
   │  ├─ HashJoin: inner on a.id = b.id
   │  │  ├─ Filter: a.id > 0
   │  │  │  └─ KeyLookup: test as a (2, 3)
   │  │  └─ Filter: b.id = 2 OR b.id = 3
   │  │     └─ KeyLookup: test as b (2)
   │  └─ Filter: c.id < 3 AND c.id = 2
   │     └─ KeyLookup: test as c (2, 3)
   └─ KeyLookup: test as d (2, 3)
2, 'b', 2, 'b', 2, 'b', 2, 'b'


================================================
FILE: src/sql/testscripts/optimizers/hash_join
================================================
# Tests the switch to hash joins where appropriate.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')
---
ok

# Equijoins are converted to hash joins.
[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id
---
Initial:
   NestedLoopJoin: inner on a.id = b.id
   ├─ Scan: test as a
   └─ Scan: test as b
HashJoin:
   HashJoin: inner on a.id = b.id
   ├─ Scan: test as a
   └─ Scan: test as b
1, 'a', 1, 'a'
2, 'b', 2, 'b'
3, 'c', 3, 'c'

# This also works for non-primary key columns.
[opt]> SELECT * FROM test a JOIN test b ON a.value = b.value
---
Initial:
   NestedLoopJoin: inner on a.value = b.value
   ├─ Scan: test as a
   └─ Scan: test as b
HashJoin:
   HashJoin: inner on a.value = b.value
   ├─ Scan: test as a
   └─ Scan: test as b
1, 'a', 1, 'a'
2, 'b', 2, 'b'
3, 'c', 3, 'c'

# However, it does not work with both.
[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id AND a.value = b.value 
---
Initial:
   NestedLoopJoin: inner on a.id = b.id AND a.value = b.value
   ├─ Scan: test as a
   └─ Scan: test as b
1, 'a', 1, 'a'
2, 'b', 2, 'b'
3, 'c', 3, 'c'

# It does not work with other predicates either. A smarter optimizer could
# move the rest of the predicate into a new filter node.
[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id AND a.value >= b.value
---
Initial:
   NestedLoopJoin: inner on a.id = b.id AND (a.value > b.value OR a.value = b.value)
   ├─ Scan: test as a
   └─ Scan: test as b
1, 'a', 1, 'a'
2, 'b', 2, 'b'
3, 'c', 3, 'c'

# However, the filter pushdown optimizer can save the day by pushing down
# independent predicates into the scans.
[opt]> SELECT * FROM test a JOIN test b ON a.id = b.id AND a.value = 'b' AND b.value = 'c'
---
Initial:
   NestedLoopJoin: inner on a.id = b.id AND a.value = 'b' AND b.value = 'c'
   ├─ Scan: test as a
   └─ Scan: test as b
FilterPushdown:
   NestedLoopJoin: inner on a.id = b.id
   ├─ Scan: test as a (a.value = 'b')
   └─ Scan: test as b (b.value = 'c')
HashJoin:
   HashJoin: inner on a.id = b.id
   ├─ Scan: test as a (a.value = 'b')
   └─ Scan: test as b (b.value = 'c')


================================================
FILE: src/sql/testscripts/optimizers/index_lookup
================================================
# Tests the index_lookup optimizer.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING INDEX, "float" FLOAT INDEX)
> INSERT INTO test VALUES (0, NULL), (1, 'a', 3.14), (2, 'b', NAN), (3, 'c', 0.0)
> CREATE TABLE other (id INT PRIMARY KEY, test_id INT REFERENCES test)
> INSERT INTO other VALUES (1, 1), (2, 2), (3, 3)
---
ok

# Primary key lookups.
[opt]> SELECT * FROM test WHERE id = 2
---
Initial:
   Filter: test.id = 2
   └─ Scan: test
FilterPushdown:
   Scan: test (test.id = 2)
IndexLookup:
   KeyLookup: test (2)
2, 'b', NaN

[opt]> SELECT * FROM test WHERE id = 1 OR id = 3
---
Initial:
   Filter: test.id = 1 OR test.id = 3
   └─ Scan: test
FilterPushdown:
   Scan: test (test.id = 1 OR test.id = 3)
IndexLookup:
   KeyLookup: test (1, 3)
1, 'a', 3.14
3, 'c', 0.0

# Can combine lookups with other predicates, but only AND.
[opt]> SELECT * FROM test WHERE (id = 1 OR id = 3) AND value > 'a'
---
Initial:
   Filter: (test.id = 1 OR test.id = 3) AND test.value > 'a'
   └─ Scan: test
FilterPushdown:
   Scan: test ((test.id = 1 OR test.id = 3) AND test.value > 'a')
IndexLookup:
   Filter: test.value > 'a'
   └─ KeyLookup: test (1, 3)
3, 'c', 0.0

[opt]> SELECT * FROM test WHERE id = 1 OR id = 3 OR value > 'a'
---
Initial:
   Filter: test.id = 1 OR test.id = 3 OR test.value > 'a'
   └─ Scan: test
FilterPushdown:
   Scan: test (test.id = 1 OR test.id = 3 OR test.value > 'a')
1, 'a', 3.14
2, 'b', NaN
3, 'c', 0.0

# Same story with secondary indexes.
[opt]> SELECT * FROM test WHERE value = 'b'
---
Initial:
   Filter: test.value = 'b'
   └─ Scan: test
FilterPushdown:
   Scan: test (test.value = 'b')
IndexLookup:
   IndexLookup: test.value ('b')
2, 'b', NaN

[opt]> SELECT * FROM test WHERE value = 'a' OR value = 'c'
---
Initial:
   Filter: test.value = 'a' OR test.value = 'c'
   └─ Scan: test
FilterPushdown:
   Scan: test (test.value = 'a' OR test.value = 'c')
IndexLookup:
   IndexLookup: test.value ('a', 'c')
1, 'a', 3.14
3, 'c', 0.0

[opt]> SELECT * FROM test WHERE (value = 'a' OR value = 'c') AND id > 1
---
Initial:
   Filter: (test.value = 'a' OR test.value = 'c') AND test.id > 1
   └─ Scan: test
FilterPushdown:
   Scan: test ((test.value = 'a' OR test.value = 'c') AND test.id > 1)
IndexLookup:
   Filter: test.id > 1
   └─ IndexLookup: test.value ('a', 'c')
3, 'c', 0.0

[opt]> SELECT * FROM test WHERE value = 'a' OR value = 'c' OR id > 1
---
Initial:
   Filter: test.value = 'a' OR test.value = 'c' OR test.id > 1
   └─ Scan: test
FilterPushdown:
   Scan: test (test.value = 'a' OR test.value = 'c' OR test.id > 1)
1, 'a', 3.14
2, 'b', NaN
3, 'c', 0.0

# NULL lookups should match for IS NULL, but not for = NULL. IS NOT NULL
# incurs a table scan.
[opt]> SELECT * FROM test WHERE value IS NULL
---
Initial:
   Filter: test.value IS NULL
   └─ Scan: test
FilterPushdown:
   Scan: test (test.value IS NULL)
IndexLookup:
   IndexLookup: test.value (NULL)
0, NULL, NULL

[opt]> SELECT * FROM test WHERE value = NULL
---
Initial:
   Filter: test.value = NULL
   └─ Scan: test
FilterPushdown:
   Scan: test (test.value = NULL)
IndexLookup:
   IndexLookup: test.value (0 values)
ShortCircuit:
   Nothing

[opt]> SELECT * FROM test WHERE value != NULL
---
Initial:
   Filter: NOT test.value = NULL
   └─ Scan: test
FilterPushdown:
   Scan: test (NOT test.value = NULL)

[opt]> SELECT * FROM test WHERE value IS NOT NULL
---
Initial:
   Filter: NOT test.value IS NULL
   └─ Scan: test
FilterPushdown:
   Scan: test (NOT test.value IS NULL)
1, 'a', 3.14
2, 'b', NaN
3, 'c', 0.0

# NAN lookups should be treated similarly to NULLs.
[opt]> SELECT * FROM test WHERE "float" IS NAN
---
Initial:
   Filter: test.float IS NAN
   └─ Scan: test
FilterPushdown:
   Scan: test (test.float IS NAN)
IndexLookup:
   IndexLookup: test.float (NaN)
2, 'b', NaN

[opt]> SELECT * FROM test WHERE "float" = NAN
---
Initial:
   Filter: test.float = NaN
   └─ Scan: test
FilterPushdown:
   Scan: test (test.float = NaN)
IndexLookup:
   IndexLookup: test.float (0 values)
ShortCircuit:
   Nothing

[opt]> SELECT * FROM test WHERE "float" = -NAN
---
Initial:
   Filter: test.float = -NaN
   └─ Scan: test
ConstantFolding:
   Filter: test.float = NaN
   └─ Scan: test
FilterPushdown:
   Scan: test (test.float = NaN)
IndexLookup:
   IndexLookup: test.float (0 values)
ShortCircuit:
   Nothing

# NB: NAN != NAN is true, so this returns every row with a non-NULL float,
# including the NaN row 2. This is unlike NULL, where NULL != NULL yields NULL
# rather than true.
[opt]> SELECT * FROM test WHERE "float" != NAN
---
Initial:
   Filter: NOT test.float = NaN
   └─ Scan: test
FilterPushdown:
   Scan: test (NOT test.float = NaN)
1, 'a', 3.14
2, 'b', NaN
3, 'c', 0.0

[opt]> SELECT * FROM test WHERE "float" IS NOT NAN
---
Initial:
   Filter: NOT test.float IS NAN
   └─ Scan: test
FilterPushdown:
   Scan: test (NOT test.float IS NAN)
1, 'a', 3.14
3, 'c', 0.0

# Inner joins on foreign keys with index lookups are transferred across.
[opt]> SELECT * FROM test JOIN other ON other.test_id = test.id WHERE test_id = 1 OR test_id = 3
---
Initial:
   Filter: other.test_id = 1 OR other.test_id = 3
   └─ NestedLoopJoin: inner on other.test_id = test.id
      ├─ Scan: test
      └─ Scan: other
FilterPushdown:
   NestedLoopJoin: inner on other.test_id = test.id
   ├─ Scan: test (test.id = 1 OR test.id = 3)
   └─ Scan: other (other.test_id = 1 OR other.test_id = 3)
IndexLookup:
   NestedLoopJoin: inner on other.test_id = test.id
   ├─ KeyLookup: test (1, 3)
   └─ IndexLookup: other.test_id (1, 3)
HashJoin:
   HashJoin: inner on test.id = other.test_id
   ├─ KeyLookup: test (1, 3)
   └─ IndexLookup: other.test_id (1, 3)
1, 'a', 3.14, 1, 1
3, 'c', 0.0, 3, 3

# It's the same if the index lookups are given in the join predicate.
[opt]> SELECT * FROM test JOIN other ON other.test_id = test.id AND (test_id = 1 OR test_id = 3)
---
Initial:
   NestedLoopJoin: inner on other.test_id = test.id AND (other.test_id = 1 OR other.test_id = 3)
   ├─ Scan: test
   └─ Scan: other
FilterPushdown:
   NestedLoopJoin: inner on other.test_id = test.id
   ├─ Scan: test (test.id = 1 OR test.id = 3)
   └─ Scan: other (other.test_id = 1 OR other.test_id = 3)
IndexLookup:
   NestedLoopJoin: inner on other.test_id = test.id
   ├─ KeyLookup: test (1, 3)
   └─ IndexLookup: other.test_id (1, 3)
HashJoin:
   HashJoin: inner on test.id = other.test_id
   ├─ KeyLookup: test (1, 3)
   └─ IndexLookup: other.test_id (1, 3)
1, 'a', 3.14, 1, 1
3, 'c', 0.0, 3, 3

# It also works with three tables.
[opt]> SELECT * FROM test \
    JOIN other a ON a.test_id = test.id AND a.test_id = 2 \
    JOIN other b ON b.test_id = test.id AND b.test_id = 1 OR b.test_id = 3
---
Initial:
   NestedLoopJoin: inner on b.test_id = test.id AND b.test_id = 1 OR b.test_id = 3
   ├─ NestedLoopJoin: inner on a.test_id = test.id AND a.test_id = 2
   │  ├─ Scan: test
   │  └─ Scan: other as a
   └─ Scan: other as b
FilterPushdown:
   NestedLoopJoin: inner on b.test_id = test.id OR b.test_id = 3
   ├─ NestedLoopJoin: inner on a.test_id = test.id
   │  ├─ Scan: test (test.id = 2)
   │  └─ Scan: other as a (a.test_id = 2)
   └─ Scan: other as b (b.test_id = 1 OR b.test_id = 3)
IndexLookup:
   NestedLoopJoin: inner on b.test_id = test.id OR b.test_id = 3
   ├─ NestedLoopJoin: inner on a.test_id = test.id
   │  ├─ KeyLookup: test (2)
   │  └─ IndexLookup: other.test_id as a.test_id (2)
   └─ IndexLookup: other.test_id as b.test_id (1, 3)
HashJoin:
   NestedLoopJoin: inner on b.test_id = test.id OR b.test_id = 3
   ├─ HashJoin: inner on test.id = a.test_id
   │  ├─ KeyLookup: test (2)
   │  └─ IndexLookup: other.test_id as a.test_id (2)
   └─ IndexLookup: other.test_id as b.test_id (1, 3)
2, 'b', NaN, 2, 2, 3, 3


================================================
FILE: src/sql/testscripts/optimizers/short_circuit
================================================
# Tests the short circuiting optimizer.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')
> CREATE TABLE ref (id INT PRIMARY KEY, test_id INT REFERENCES test)
> INSERT INTO ref VALUES (1, 1), (2, 2), (3, 3)
---
ok

# TRUE predicates are removed.
[opt]> SELECT * FROM test WHERE TRUE
---
Initial:
   Filter: TRUE
   └─ Scan: test
FilterPushdown:
   Scan: test (TRUE)
ShortCircuit:
   Scan: test
1, 'a'
2, 'b'
3, 'c'

[opt]> SELECT 1, 2, 3 WHERE TRUE
---
Initial:
   Projection: 1, 2, 3
   └─ Filter: TRUE
      └─ Values: blank row
ShortCircuit:
   Projection: 1, 2, 3
   └─ Values: blank row
1, 2, 3

[opt]> SELECT * FROM test JOIN ref ON TRUE
---
Initial:
   NestedLoopJoin: inner on TRUE
   ├─ Scan: test
   └─ Scan: ref
FilterPushdown:
   NestedLoopJoin: inner
   ├─ Scan: test (TRUE)
   └─ Scan: ref (TRUE)
ShortCircuit:
   NestedLoopJoin: inner
   ├─ Scan: test
   └─ Scan: ref
1, 'a', 1, 1
1, 'a', 2, 2
1, 'a', 3, 3
2, 'b', 1, 1
2, 'b', 2, 2
2, 'b', 3, 3
3, 'c', 1, 1
3, 'c', 2, 2
3, 'c', 3, 3

# FALSE predicates → Nothing (but retains column headers)
[opt,header]> SELECT * FROM test WHERE FALSE
---
Initial:
   Filter: FALSE
   └─ Scan: test
FilterPushdown:
   Scan: test (FALSE)
ShortCircuit:
   Nothing
test.id, test.value

[opt,header]> SELECT 1, 2, 3 WHERE FALSE
---
Initial:
   Projection: 1, 2, 3
   └─ Filter: FALSE
      └─ Values: blank row
ShortCircuit:
   Nothing
, , 

[opt,header]> SELECT * FROM test JOIN ref ON ref.test_id = test.id AND FALSE
---
Initial:
   NestedLoopJoin: inner on ref.test_id = test.id AND FALSE
   ├─ Scan: test
   └─ Scan: ref
ConstantFolding:
   NestedLoopJoin: inner on FALSE
   ├─ Scan: test
   └─ Scan: ref
FilterPushdown:
   NestedLoopJoin: inner
   ├─ Scan: test (FALSE)
   └─ Scan: ref (FALSE)
ShortCircuit:
   Nothing
test.id, test.value, ref.id, ref.test_id

# NULL predicates → Nothing
[opt,header]> SELECT * FROM test WHERE NULL
---
Initial:
   Filter: NULL
   └─ Scan: test
FilterPushdown:
   Scan: test (NULL)
ShortCircuit:
   Nothing
test.id, test.value

[opt,header]> SELECT 1, 2, 3 WHERE NULL
---
Initial:
   Projection: 1, 2, 3
   └─ Filter: NULL
      └─ Values: blank row
ShortCircuit:
   Nothing
, , 

[opt,header]> SELECT * FROM test JOIN ref ON ref.test_id = test.id AND NULL
---
Initial:
   NestedLoopJoin: inner on ref.test_id = test.id AND NULL
   ├─ Scan: test
   └─ Scan: ref
FilterPushdown:
   NestedLoopJoin: inner on ref.test_id = test.id
   ├─ Scan: test (NULL)
   └─ Scan: ref (NULL)
HashJoin:
   HashJoin: inner on test.id = ref.test_id
   ├─ Scan: test (NULL)
   └─ Scan: ref (NULL)
ShortCircuit:
   Nothing
test.id, test.value, ref.id, ref.test_id

# Empty key/index lookups → Nothing
[opt,header]> SELECT * FROM test WHERE id = NULL
---
Initial:
   Filter: test.id = NULL
   └─ Scan: test
FilterPushdown:
   Scan: test (test.id = NULL)
IndexLookup:
   KeyLookup: test (0 keys)
ShortCircuit:
   Nothing
test.id, test.value

[opt,header]> SELECT * FROM ref WHERE test_id = NULL
---
Initial:
   Filter: ref.test_id = NULL
   └─ Scan: ref
FilterPushdown:
   Scan: ref (ref.test_id = NULL)
IndexLookup:
   IndexLookup: ref.test_id (0 values)
ShortCircuit:
   Nothing
ref.id, ref.test_id

# LIMIT 0 → Nothing
[opt,header]> SELECT * FROM test LIMIT 0
---
Initial:
   Limit: 0
   └─ Scan: test
ShortCircuit:
   Nothing
test.id, test.value

# Remove projections that simply pass through source columns. Aliased
# column names are retained.
[opt]> SELECT id, value FROM test
---
Initial:
   Projection: test.id, test.value
   └─ Scan: test
ShortCircuit:
   Scan: test
1, 'a'
2, 'b'
3, 'c'

[opt,header]> SELECT id AS foo, value AS bar FROM test
---
Initial:
   Projection: test.id as foo, test.value as bar
   └─ Scan: test
foo, bar
1, 'a'
2, 'b'
3, 'c'

[opt]> SELECT id, MIN(id), MAX(id) FROM test GROUP BY id
---
Initial:
   Projection: test.id, #1, #2
   └─ Aggregate: test.id, min(test.id), max(test.id)
      └─ Scan: test
ShortCircuit:
   Aggregate: test.id, min(test.id), max(test.id)
   └─ Scan: test
1, 1, 1
2, 2, 2
3, 3, 3

# Constant folding happens before short-circuiting.
[opt]> SELECT * FROM test WHERE 1 != 1 OR 0 > 3 AND NOT NULL
---
Initial:
   Filter: NOT 1 = 1 OR 0 > 3 AND NOT NULL
   └─ Scan: test
ConstantFolding:
   Filter: FALSE
   └─ Scan: test
FilterPushdown:
   Scan: test (FALSE)
ShortCircuit:
   Nothing


================================================
FILE: src/sql/testscripts/queries/aggregate
================================================
# Tests aggregate functions.

> CREATE TABLE test ( \
    id INT PRIMARY KEY, \
    "bool" BOOLEAN, \
    "int" INTEGER, \
    "float" FLOAT, \
    "string" STRING \
)
> INSERT INTO test VALUES (0, NULL, NULL,   NULL,      NULL)
> INSERT INTO test VALUES (1, TRUE,   -1,   3.14,      '')
> INSERT INTO test VALUES (2, FALSE,  0,    2.718,     'abc')
> INSERT INTO test VALUES (3, TRUE,   3,    -0.0,      'AB')
> INSERT INTO test VALUES (4, NULL,   42,   INFINITY,  '👋')
> INSERT INTO test VALUES (5, NULL,   NULL, NAN,       NULL)
---
ok

# COUNT(*) returns the row count.
[plan]> SELECT COUNT(*) FROM test
---
Aggregate: count(TRUE)
└─ Scan: test
6

# COUNT works on constant values.
[plan,header]> SELECT COUNT(NULL), COUNT(TRUE), COUNT(1), COUNT(3.14), COUNT(NAN), COUNT('')
---
Aggregate: count(NULL), count(TRUE), count(1), count(3.14), count(NaN), count('')
└─ Values: blank row
, , , , , 
0, 1, 1, 1, 1, 1

# COUNT works on no rows.
[plan]> SELECT COUNT(id), COUNT("bool"), COUNT("float"), COUNT("string") FROM test WHERE false
---
Aggregate: count(test.id), count(test.bool), count(test.float), count(test.string)
└─ Nothing
0, 0, 0, 0

# COUNT returns number of non-NULL values.
[plan,header]> SELECT COUNT(id), COUNT("bool"), COUNT("float"), COUNT("string") FROM test
---
Aggregate: count(test.id), count(test.bool), count(test.float), count(test.string)
└─ Scan: test
, , , 
6, 3, 5, 4

# MAX works on constant values.
[plan]> SELECT MAX(NULL), MAX(TRUE), MAX(1), MAX(3.14), MAX(NAN), MAX('foo') FROM test
---
Aggregate: max(NULL), max(TRUE), max(1), max(3.14), max(NaN), max('foo')
└─ Scan: test
NULL, TRUE, 1, 3.14, NaN, 'foo'

# MAX works on no rows.
[plan]> SELECT MAX(id), MAX("bool"), MAX("float"), MAX("string") FROM test WHERE false
---
Aggregate: max(test.id), max(test.bool), max(test.float), max(test.string)
└─ Nothing
NULL, NULL, NULL, NULL

# MAX returns the max value, ignoring NULL values. NaN sorts above all other floats.
[plan]> SELECT MAX(id) FROM test
---
Aggregate: max(test.id)
└─ Scan: test
5

> SELECT MAX("bool") FROM test
---
TRUE

> SELECT MAX("int") FROM test
---
42

> SELECT MAX("float") FROM test
> SELECT MAX("float") FROM test WHERE "float" IS NOT NAN
---
NaN
inf

> SELECT MAX("string") FROM test
---
'👋'

# MIN works on constant values.
[plan]> SELECT MIN(NULL), MIN(TRUE), MIN(1), MIN(3.14), MIN(NAN), MIN('foo') FROM test
---
Aggregate: min(NULL), min(TRUE), min(1), min(3.14), min(NaN), min('foo')
└─ Scan: test
NULL, TRUE, 1, 3.14, NaN, 'foo'

# MIN works on no rows.
[plan]> SELECT MIN(id), MIN("bool"), MIN("float"), MIN("string") FROM test WHERE false
---
Aggregate: min(test.id), min(test.bool), min(test.float), min(test.string)
└─ Nothing
NULL, NULL, NULL, NULL

# MIN returns the min value, ignoring NULL values.
[plan]> SELECT MIN(id) FROM test
---
Aggregate: min(test.id)
└─ Scan: test
0

> SELECT MIN("bool") FROM test
---
FALSE

> SELECT MIN("int") FROM test
---
-1

> SELECT MIN("float") FROM test
---
0.0

> SELECT MIN("string") FROM test
---
''

# SUM works on constant values, but only numbers.
[plan]> SELECT SUM(NULL), SUM(1), SUM(3.14), SUM(NAN) FROM test
---
Aggregate: sum(NULL), sum(1), sum(3.14), sum(NaN)
└─ Scan: test
NULL, 6, 18.84, NaN

!> SELECT SUM(TRUE)
!> SELECT SUM('foo')
---
Error: invalid input: can't add 0 and TRUE
Error: invalid input: can't add 0 and 'foo'

# SUM works on no rows.
[plan]> SELECT SUM(id), SUM("bool"), SUM("float"), SUM("string") FROM test WHERE false
---
Aggregate: sum(test.id), sum(test.bool), sum(test.float), sum(test.string)
└─ Nothing
NULL, NULL, NULL, NULL

# SUM returns the sum, ignoring NULL values. Errors
# on booleans or strings.
[plan]> SELECT SUM(id) FROM test
---
Aggregate: sum(test.id)
└─ Scan: test
15

!> SELECT SUM("bool") FROM test
---
Error: invalid input: can't add 0 and TRUE

> SELECT SUM("int") FROM test
---
44

> SELECT SUM("float") FROM test
> SELECT SUM("float") FROM test WHERE "float" IS NOT NAN
---
NaN
inf

!> SELECT SUM("string") FROM test
---
Error: invalid input: can't add 0 and ''

# AVG works on constant values, but only numbers.
[plan]> SELECT AVG(NULL), AVG(1), AVG(3.14), AVG(NAN) FROM test
---
Aggregate: avg(NULL), avg(1), avg(3.14), avg(NaN)
└─ Scan: test
NULL, 1, 3.14, NaN

!> SELECT AVG(TRUE)
!> SELECT AVG('foo')
---
Error: invalid input: can't add 0 and TRUE
Error: invalid input: can't add 0 and 'foo'

# AVG works on no rows.
[plan]> SELECT AVG(id), AVG("bool"), AVG("float"), AVG("string") FROM test WHERE false
---
Aggregate: avg(test.id), avg(test.bool), avg(test.float), avg(test.string)
└─ Nothing
NULL, NULL, NULL, NULL

# AVG returns the average of the non-NULL values. Errors
# on booleans or strings.
[plan]> SELECT AVG(id) FROM test
---
Aggregate: avg(test.id)
└─ Scan: test
2

!> SELECT AVG("bool") FROM test
---
Error: invalid input: can't add 0 and TRUE

> SELECT AVG("int") FROM test
---
11

> SELECT AVG("float") FROM test
> SELECT AVG("float") FROM test WHERE "float" IS NOT NAN
---
NaN
inf

!> SELECT AVG("string") FROM test
---
Error: invalid input: can't add 0 and ''

# Constant aggregates can be used with rows.
[plan]> SELECT COUNT(1), MIN(1), MAX(1), SUM(1), AVG(1) FROM test
---
Aggregate: count(1), min(1), max(1), sum(1), avg(1)
└─ Scan: test
6, 1, 1, 6, 1

# Constant aggregates can't be mixed with non-aggregated columns.
!> SELECT *, COUNT(1), MIN(1), MAX(1), SUM(1), AVG(1) FROM test
!> SELECT id, COUNT(1), MIN(1), MAX(1), SUM(1), AVG(1) FROM test
---
Error: invalid input: column test.id must be used in an aggregate or GROUP BY expression
Error: invalid input: column id must be used in an aggregate or GROUP BY expression

# Repeated aggregates work.
[plan]> SELECT MAX("int"), MAX("int"), MAX("int") FROM test
---
Projection: #0, #0, #0
└─ Aggregate: max(test.int)
   └─ Scan: test
42, 42, 42

# Expressions can be used both inside and outside the aggregate function.
[plan]> SELECT SUM("int" * 10) / COUNT("int") + 7 FROM test WHERE "int" IS NOT NULL
---
Projection: #0 / #1 + 7
└─ Aggregate: sum(test.int * 10), count(test.int)
   └─ Scan: test (NOT test.int IS NULL)
117

# Aggregate functions can't be nested.
!> SELECT MIN(MAX("int")) FROM test
---
Error: invalid input: aggregate functions can't be nested

# Can't mix aggregate and non-aggregate expressions.
!> SELECT MAX("int") - "int" FROM test
---
Error: invalid input: column int must be used in an aggregate or GROUP BY expression


================================================
FILE: src/sql/testscripts/queries/clauses
================================================
# Tests the ordering of SELECT clauses.

> CREATE TABLE test ( \
    id INT PRIMARY KEY, \
    "bool" BOOLEAN, \
    "float" FLOAT, \
    "int" INT, \
    "string" STRING \
)
> INSERT INTO test VALUES (1, true, 3.14, 7, 'foo')
> INSERT INTO test VALUES (2, false, 2.718, 1, '👍')
> INSERT INTO test VALUES (3, NULL, NULL, NULL, NULL)
---
ok

# This is the only valid order of all clauses:
> SELECT COUNT(*) FROM test WHERE TRUE GROUP BY TRUE HAVING TRUE ORDER BY TRUE LIMIT 1 OFFSET 1
---
ok

# All clauses except SELECT are optional.
> SELECT COUNT(*) WHERE TRUE GROUP BY TRUE HAVING TRUE ORDER BY TRUE LIMIT 1 OFFSET 1
---
ok

> SELECT COUNT(*) FROM test GROUP BY TRUE HAVING TRUE ORDER BY TRUE LIMIT 1 OFFSET 1
---
ok

> SELECT COUNT(*) FROM test WHERE TRUE HAVING TRUE ORDER BY TRUE LIMIT 1 OFFSET 1
---
ok

> SELECT COUNT(*) FROM test WHERE TRUE GROUP BY TRUE ORDER BY TRUE LIMIT 1 OFFSET 1
---
ok

> SELECT COUNT(*) FROM test WHERE TRUE GROUP BY TRUE HAVING TRUE LIMIT 1 OFFSET 1
---
ok

> SELECT COUNT(*) FROM test WHERE TRUE GROUP BY TRUE HAVING TRUE ORDER BY TRUE OFFSET 1
---
ok

> SELECT COUNT(*) FROM test WHERE TRUE GROUP BY TRUE HAVING TRUE ORDER BY TRUE LIMIT 1
---
3

# The clause order is required. Moving any clause to the next position errors.
!> FROM test SELECT COUNT(*) WHERE TRUE GROUP BY TRUE HAVING TRUE ORDER BY TRUE LIMIT 1 OFFSET 1
---
Error: invalid input: unexpected token FROM

!> SELECT COUNT(*) FROM test GROUP BY TRUE WHERE TRUE HAVING TRUE ORDER BY TRUE LIMIT 1 OFFSET 1
---
Error: invalid input: unexpected token WHERE

!> SELECT COUNT(*) FROM test WHERE TRUE HAVING TRUE GROUP BY TRUE ORDER BY TRUE LIMIT 1 OFFSET 1
---
Error: invalid input: unexpected token GROUP

!> SELECT COUNT(*) FROM test WHERE TRUE ORDER BY TRUE GROUP BY TRUE HAVING TRUE LIMIT 1 OFFSET 1
---
Error: invalid input: unexpected token GROUP

!> SELECT COUNT(*) FROM test WHERE TRUE GROUP BY TRUE HAVING TRUE LIMIT 1 ORDER BY TRUE OFFSET 1
---
Error: invalid input: unexpected token ORDER

!> SELECT COUNT(*) FROM test WHERE TRUE GROUP BY TRUE HAVING TRUE ORDER BY TRUE OFFSET 1 LIMIT 1 
---
Error: invalid input: unexpected token LIMIT


================================================
FILE: src/sql/testscripts/queries/group_by
================================================
# Tests GROUP BY clauses. See "aggregate" for aggregate function tests.

> CREATE TABLE test ( \
    id INT PRIMARY KEY, \
    "group" STRING, \
    "bool" BOOLEAN, \
    "int" INTEGER, \
    "float" FLOAT, \
    "string" STRING \
)
> INSERT INTO test VALUES (0, NULL,   NULL, NULL,   NULL,      NULL)
> INSERT INTO test VALUES (1, 'a',    TRUE,   -1,   3.14,      '')
> INSERT INTO test VALUES (2, 'b',    FALSE,  0,    NAN,       'abc')
> INSERT INTO test VALUES (3, 'a',    TRUE,   3,    -0.0,      'AB')
> INSERT INTO test VALUES (4, 'b',    TRUE,   42,   INFINITY,  '👋')
> INSERT INTO test VALUES (5, 'a',    FALSE,  7,    NAN,       '')
> INSERT INTO test VALUES (6, 'b',    FALSE,  -1,   0.0,       'abc')

> CREATE TABLE other (id INT PRIMARY KEY, value STRING)
> INSERT INTO other VALUES (1, 'a'), (2, 'b')
---
ok

# Grouping with no input rows yields empty result.
[plan]> SELECT COUNT(id), MIN(id), MAX(id), SUM(id), AVG(id) FROM test WHERE FALSE GROUP BY id
---
Projection: #1, #2, #3, #4, #5
└─ Aggregate: test.id, count(test.id), min(test.id), max(test.id), sum(test.id), avg(test.id)
   └─ Nothing

# Simple GROUP BY, including NULL group.
[plan]> SELECT "group", COUNT(*) FROM test GROUP BY "group"
---
Aggregate: test.group, count(TRUE)
└─ Scan: test
NULL, 1
'a', 3
'b', 3

[plan]> SELECT "group", COUNT(*), MIN("bool"), MAX("string"), SUM("int"), AVG("float") \
    FROM test GROUP BY "group"
---
Aggregate: test.group, count(TRUE), min(test.bool), max(test.string), sum(test.int), avg(test.float)
└─ Scan: test
NULL, 1, NULL, NULL, NULL, NULL
'a', 3, FALSE, 'AB', 9, NaN
'b', 3, FALSE, '👋', 41, NaN

# GROUP BY works on booleans.
[plan]> SELECT "bool", COUNT(*) FROM test GROUP BY "bool"
---
Aggregate: test.bool, count(TRUE)
└─ Scan: test
NULL, 1
FALSE, 3
TRUE, 3

# GROUP BY works on integers.
[plan]> SELECT "int", COUNT(*) FROM test GROUP BY "int"
---
Aggregate: test.int, count(TRUE)
└─ Scan: test
NULL, 1
-1, 2
0, 1
3, 1
7, 1
42, 1

# GROUP BY works with floats, including a NAN group and -0.0 and 0.0 being equal.
[plan]> SELECT "float", COUNT(*) FROM test GROUP BY "float"
---
Aggregate: test.float, count(TRUE)
└─ Scan: test
NULL, 1
0.0, 2
3.14, 1
inf, 1
NaN, 2

# GROUP BY works on strings.
[plan]> SELECT "string", COUNT(*) FROM test GROUP BY "string"
---
Aggregate: test.string, count(TRUE)
└─ Scan: test
NULL, 1
'', 2
'AB', 1
'abc', 2
'👋', 1

# GROUP BY works even if the group column isn't in the result.
[plan]> SELECT COUNT(*) FROM test GROUP BY "group"
---
Projection: #1
└─ Aggregate: test.group, count(TRUE)
   └─ Scan: test
1
3
3

# GROUP BY works when there is no aggregate function.
[plan]> SELECT "group" FROM test GROUP BY "group"
---
Aggregate: test.group
└─ Scan: test
NULL
'a'
'b'

# GROUP BY does not work with SELECT aliases (also the case in e.g. SQL Server).
!> SELECT "group" AS g, COUNT(*) FROM test GROUP BY g
---
Error: invalid input: unknown column g

[plan]> SELECT "group", COUNT(*) FROM test AS t GROUP BY t."group"
---
Aggregate: t.group, count(TRUE)
└─ Scan: test as t
NULL, 1
'a', 3
'b', 3

!> SELECT "group", COUNT(*) FROM test AS t GROUP BY test."group"
---
Error: invalid input: unknown table test

# It errors when there is a non-group column.
!> SELECT "group", id FROM test GROUP BY "group"
---
Error: invalid input: column id must be used in an aggregate or GROUP BY expression

# It errors on unknown tables and columns.
!> SELECT COUNT(*) FROM test GROUP BY unknown
!> SELECT COUNT(*) FROM test GROUP BY unknown.id
---
Error: invalid input: unknown column unknown
Error: invalid input: unknown table unknown

# GROUP BY can be arbitrary expressions.
[plan]> SELECT COUNT(*) FROM test GROUP BY 1
---
Projection: #1
└─ Aggregate: 1, count(TRUE)
   └─ Scan: test
7

[plan]> SELECT COUNT(*) FROM test GROUP BY id % 2
---
Projection: #1
└─ Aggregate: test.id % 2, count(TRUE)
   └─ Scan: test
4
3

# GROUP BY can use an expression also used in the SELECT.
[plan]> SELECT id % 2, COUNT(*) FROM test GROUP BY id % 2
---
Aggregate: test.id % 2, count(TRUE)
└─ Scan: test
0, 4
1, 3

# Can mix GROUP BY and aggregate expressions in SELECT.
[plan]> SELECT MAX("int") + id % 2 FROM test GROUP BY id
---
Projection: #1 + test.id % 2
└─ Aggregate: test.id, max(test.int)
   └─ Scan: test
NULL
0
0
4
42
8
-1

# GROUP BY can't use an aliased expression.
!> SELECT id % 2 AS mod, COUNT(*) FROM test GROUP BY mod
---
Error: invalid input: unknown column mod

# GROUP BY can't use aggregate functions.
!> SELECT COUNT(*) FROM test GROUP BY MIN(id)
---
Error: invalid input: unknown function min with 1 arguments

# GROUP BY works with multiple groups.
[plan]> SELECT "group", "bool", COUNT(*) FROM test GROUP BY "group", "bool"
---
Aggregate: test.group, test.bool, count(TRUE)
└─ Scan: test
NULL, NULL, 1
'a', FALSE, 1
'a', TRUE, 2
'b', FALSE, 2
'b', TRUE, 1

# Repeated GROUP BY column works.
[plan]> SELECT "group", "group", "group", COUNT(*) FROM test GROUP BY "group", "group"
---
Projection: test.group, test.group, test.group, #1
└─ Aggregate: test.group, count(TRUE)
   └─ Scan: test
NULL, NULL, NULL, 1
'a', 'a', 'a', 3
'b', 'b', 'b', 3

# GROUP BY works with joins.
[plan]> SELECT t.id % 2, COUNT(*) FROM test t JOIN other o ON t.id % 2 = o.id GROUP BY t.id % 2
---
Aggregate: t.id % 2, count(TRUE)
└─ NestedLoopJoin: inner on t.id % 2 = o.id
   ├─ Scan: test as t
   └─ Scan: other as o
1, 3

# SELECT * requires all columns to be in GROUP BY.
!> SELECT * FROM test GROUP BY id
---
Error: invalid input: column test.group must be used in an aggregate or GROUP BY expression

[plan]> SELECT * FROM test GROUP BY id, "group", "bool", "int", "float", "string"
---
Aggregate: test.id, test.group, test.bool, test.int, test.float, test.string
└─ Scan: test
0, NULL, NULL, NULL, NULL, NULL
1, 'a', TRUE, -1, 3.14, ''
2, 'b', FALSE, 0, NaN, 'abc'
3, 'a', TRUE, 3, 0.0, 'AB'
4, 'b', TRUE, 42, inf, '👋'
5, 'a', FALSE, 7, NaN, ''
6, 'b', FALSE, -1, 0.0, 'abc'

[plan]> SELECT * FROM test GROUP BY "bool", "int", "float", "string", "group", id
---
Projection: test.id, test.group, test.bool, test.int, test.float, test.string
└─ Aggregate: test.bool, test.int, test.float, test.string, test.group, test.id
   └─ Scan: test
0, NULL, NULL, NULL, NULL, NULL
6, 'b', FALSE, -1, 0.0, 'abc'
2, 'b', FALSE, 0, NaN, 'abc'
5, 'a', FALSE, 7, NaN, ''
1, 'a', TRUE, -1, 3.14, ''
3, 'a', TRUE, 3, 0.0, 'AB'
4, 'b', TRUE, 42, inf, '👋'


================================================
FILE: src/sql/testscripts/queries/having
================================================
# Tests HAVING clauses. See "aggregate" and "group_by" for related tests.

> CREATE TABLE test ( \
    id INT PRIMARY KEY, \
    "group" STRING, \
    "bool" BOOLEAN, \
    "int" INTEGER, \
    "float" FLOAT, \
    "string" STRING \
)
> INSERT INTO test VALUES (0, NULL,   NULL, NULL,   NULL,      NULL)
> INSERT INTO test VALUES (1, 'a',    TRUE,   -1,   3.14,      '')
> INSERT INTO test VALUES (2, 'b',    FALSE,  0,    NAN,       'abc')
> INSERT INTO test VALUES (3, 'a',    TRUE,   3,    -0.0,      'AB')
> INSERT INTO test VALUES (4, 'b',    TRUE,   42,   INFINITY,  '👋')
> INSERT INTO test VALUES (5, 'a',    FALSE,  7,    NAN,       '')
> INSERT INTO test VALUES (6, 'b',    FALSE,  -1,   0.0,       'abc')
---
ok

# Having requires an aggregate function or GROUP BY clause.
!> SELECT * FROM test HAVING id > 3
---
Error: invalid input: HAVING requires GROUP BY or aggregate function

> SELECT COUNT(*) FROM test HAVING COUNT(*) > 0
---
7

> SELECT TRUE FROM test GROUP BY id HAVING id > 0
---
TRUE
TRUE
TRUE
TRUE
TRUE
TRUE

# Having works with an aggregate function, even if it's not in SELECT.
[plan]> SELECT "group", MAX("int") FROM test GROUP BY "group" HAVING MAX("int") > 10
---
Filter: #1 > 10
└─ Aggregate: test.group, max(test.int)
   └─ Scan: test
'b', 42

[plan]> SELECT "group" FROM test GROUP BY "group" HAVING MAX("int") > 10
---
Remap: test.group (dropped: #1)
└─ Filter: #1 > 10
   └─ Aggregate: test.group, max(test.int)
      └─ Scan: test
'b'

[plan]> SELECT "group", MAX("int") FROM test GROUP BY "group" HAVING MAX("int") - MIN("int") > 10
---
Remap: test.group, #1 (dropped: #2)
└─ Filter: #1 - #2 > 10
   └─ Aggregate: test.group, max(test.int), min(test.int)
      └─ Scan: test
'b', 42

# Having works with SELECT aliases.
[plan]> SELECT "group", MAX("int") AS m FROM test GROUP BY "group" HAVING m > 10
---
Filter: m > 10
└─ Projection: test.group, #1 as m
   └─ Aggregate: test.group, max(test.int)
      └─ Scan: test
'b', 42

# Having works with an aggregate function not in the SELECT clause.
[plan]> SELECT "group", COUNT(*) FROM test GROUP BY "group" HAVING MAX("int") > 10
---
Remap: test.group, #1 (dropped: #2)
└─ Filter: #2 > 10
   └─ Aggregate: test.group, count(TRUE), max(test.int)
      └─ Scan: test
'b', 3

# Having works with compound expressions.
[plan]> SELECT "group", COUNT(*) FROM test GROUP BY "group" HAVING MAX("int") / COUNT(*) > 3
---
Remap: test.group, #1 (dropped: #2)
└─ Filter: #2 / #1 > 3
   └─ Aggregate: test.group, count(TRUE), max(test.int)
      └─ Scan: test
'b', 3

# Having works with compound expressions using complex GROUP BY expressions
# that are not in the SELECT clause.
[plan]> SELECT COUNT(*) FROM test GROUP BY id % 2 HAVING 2 - id % 2 + 1 > 1
---
Remap: #0 (dropped: #1)
└─ Filter: 2 - #1 + 1 > 1
   └─ Projection: #1, #0
      └─ Aggregate: test.id % 2, count(TRUE)
         └─ Scan: test
4
3

# Having can use (un)qualified expressions for an (un)qualified GROUP BY.
[plan]> SELECT COUNT(*) FROM test GROUP BY "group" HAVING test."group" = 'a'
---
Remap: #0 (dropped: test.group)
└─ Filter: test.group = 'a'
   └─ Projection: #1, test.group
      └─ Aggregate: test.group, count(TRUE)
         └─ Scan: test
3

[plan]> SELECT COUNT(*) FROM test GROUP BY test."group" HAVING "group" = 'a'
---
Remap: #0 (dropped: test.group)
└─ Filter: test.group = 'a'
   └─ Projection: #1, test.group
      └─ Aggregate: test.group, count(TRUE)
         └─ Scan: test
3

# Having errors on nested aggregate functions.
!> SELECT "group", COUNT(*) FROM test GROUP BY "group" HAVING MAX(MIN("int")) > 0
---
Error: invalid input: aggregate functions can't be nested

# Having errors on columns not in the SELECT or GROUP BY clauses.
!> SELECT "group", COUNT(*) FROM test GROUP BY "group" HAVING id > 3
---
Error: invalid input: column id must be used in an aggregate or GROUP BY expression


================================================
FILE: src/sql/testscripts/queries/join_cross
================================================
# Tests cross joins.

# Set up a movies dataset.
> CREATE TABLE countries ( \
    id STRING PRIMARY KEY, \
    name STRING NOT NULL \
)
> INSERT INTO countries VALUES \
    ('fr', 'France'), \
    ('ru', 'Russia'), \
    ('us', 'United States of America')
> CREATE TABLE genres ( \
    id INTEGER PRIMARY KEY, \
    name STRING NOT NULL \
)
> INSERT INTO genres VALUES \
    (1, 'Science Fiction'), \
    (2, 'Action'), \
    (3, 'Comedy')
> CREATE TABLE studios ( \
    id INTEGER PRIMARY KEY, \
    name STRING NOT NULL, \
    country_id STRING INDEX REFERENCES countries \
)
> INSERT INTO studios VALUES \
    (1, 'Mosfilm', 'ru'), \
    (2, 'Lionsgate', 'us'), \
    (3, 'StudioCanal', 'fr'), \
    (4, 'Warner Bros', 'us')
> CREATE TABLE movies ( \
    id INTEGER PRIMARY KEY, \
    title STRING NOT NULL, \
    studio_id INTEGER NOT NULL INDEX REFERENCES studios, \
    genre_id INTEGER NOT NULL INDEX REFERENCES genres, \
    released INTEGER NOT NULL, \
    rating FLOAT, \
    ultrahd BOOLEAN \
)
> INSERT INTO movies VALUES \
    (1, 'Stalker', 1, 1, 1979, 8.2, NULL), \
    (2, 'Sicario', 2, 2, 2015, 7.6, TRUE), \
    (3, 'Primer', 3, 1, 2004, 6.9, NULL), \
    (4, 'Heat', 4, 2, 1995, 8.2, TRUE), \
    (5, 'The Fountain', 4, 1, 2006, 7.2, FALSE), \
    (6, 'Solaris', 1, 1, 1972, 8.1, NULL), \
    (7, 'Gravity', 4, 1, 2013, 7.7, TRUE), \
    (8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE), \
    (9, 'Birdman', 4, 3, 2014, 7.7, TRUE), \
    (10, 'Inception', 4, 1, 2010, 8.8, TRUE)
---
ok

# Explicit cross join.
[plan,header]> SELECT * FROM movies CROSS JOIN genres
---
NestedLoopJoin: inner
├─ Scan: movies
└─ Scan: genres
movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy'

# Explicit triple cross join.
[plan,header]> SELECT * FROM movies CROSS JOIN genres CROSS JOIN studios
---
NestedLoopJoin: inner
├─ NestedLoopJoin: inner
│  ├─ Scan: movies
│  └─ Scan: genres
└─ Scan: studios
movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name, studios.id, studios.name, studios.country_id
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'

# Explicit cross join with other order.
[plan,header]> SELECT * FROM studios CROSS JOIN movies CROSS JOIN genres
---
NestedLoopJoin: inner
├─ NestedLoopJoin: inner
│  ├─ Scan: studios
│  └─ Scan: movies
└─ Scan: genres
studios.id, studios.name, studios.country_id, movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name
1, 'Mosfilm', 'ru', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action'
1, 'Mosfilm', 'ru', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy'
1, 'Mosfilm', 'ru', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
1, 'Mosfilm', 'ru', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy'
1, 'Mosfilm', 'ru', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action'
1, 'Mosfilm', 'ru', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
1, 'Mosfilm', 'ru', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
1, 'Mosfilm', 'ru', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy'
1, 'Mosfilm', 'ru', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action'
1, 'Mosfilm', 'ru', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy'
1, 'Mosfilm', 'ru', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action'
1, 'Mosfilm', 'ru', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy'
1, 'Mosfilm', 'ru', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action'
1, 'Mosfilm', 'ru', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy'
1, 'Mosfilm', 'ru', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action'
1, 'Mosfilm', 'ru', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
1, 'Mosfilm', 'ru', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action'
1, 'Mosfilm', 'ru', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
1, 'Mosfilm', 'ru', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'
1, 'Mosfilm', 'ru', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action'
1, 'Mosfilm', 'ru', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy'
2, 'Lionsgate', 'us', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action'
2, 'Lionsgate', 'us', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy'
2, 'Lionsgate', 'us', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
2, 'Lionsgate', 'us', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy'
2, 'Lionsgate', 'us', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action'
2, 'Lionsgate', 'us', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
2, 'Lionsgate', 'us', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
2, 'Lionsgate', 'us', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy'
2, 'Lionsgate', 'us', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action'
2, 'Lionsgate', 'us', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy'
2, 'Lionsgate', 'us', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action'
2, 'Lionsgate', 'us', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy'
2, 'Lionsgate', 'us', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action'
2, 'Lionsgate', 'us', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy'
2, 'Lionsgate', 'us', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action'
2, 'Lionsgate', 'us', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
2, 'Lionsgate', 'us', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action'
2, 'Lionsgate', 'us', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
2, 'Lionsgate', 'us', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'
2, 'Lionsgate', 'us', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action'
2, 'Lionsgate', 'us', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy'
3, 'StudioCanal', 'fr', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action'
3, 'StudioCanal', 'fr', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy'
3, 'StudioCanal', 'fr', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
3, 'StudioCanal', 'fr', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy'
3, 'StudioCanal', 'fr', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action'
3, 'StudioCanal', 'fr', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
3, 'StudioCanal', 'fr', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
3, 'StudioCanal', 'fr', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy'
3, 'StudioCanal', 'fr', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action'
3, 'StudioCanal', 'fr', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy'
3, 'StudioCanal', 'fr', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action'
3, 'StudioCanal', 'fr', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy'
3, 'StudioCanal', 'fr', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action'
3, 'StudioCanal', 'fr', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy'
3, 'StudioCanal', 'fr', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action'
3, 'StudioCanal', 'fr', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
3, 'StudioCanal', 'fr', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action'
3, 'StudioCanal', 'fr', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
3, 'StudioCanal', 'fr', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'
3, 'StudioCanal', 'fr', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action'
3, 'StudioCanal', 'fr', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy'
4, 'Warner Bros', 'us', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action'
4, 'Warner Bros', 'us', 1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy'
4, 'Warner Bros', 'us', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
4, 'Warner Bros', 'us', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy'
4, 'Warner Bros', 'us', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action'
4, 'Warner Bros', 'us', 3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
4, 'Warner Bros', 'us', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
4, 'Warner Bros', 'us', 4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy'
4, 'Warner Bros', 'us', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action'
4, 'Warner Bros', 'us', 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy'
4, 'Warner Bros', 'us', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action'
4, 'Warner Bros', 'us', 6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy'
4, 'Warner Bros', 'us', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action'
4, 'Warner Bros', 'us', 7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy'
4, 'Warner Bros', 'us', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action'
4, 'Warner Bros', 'us', 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
4, 'Warner Bros', 'us', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action'
4, 'Warner Bros', 'us', 9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
4, 'Warner Bros', 'us', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'
4, 'Warner Bros', 'us', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action'
4, 'Warner Bros', 'us', 10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy'

# Implicit cross join.
[plan,header]> SELECT * FROM movies, genres, studios
---
NestedLoopJoin: inner
├─ NestedLoopJoin: inner
│  ├─ Scan: movies
│  └─ Scan: genres
└─ Scan: studios
movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name, studios.id, studios.name, studios.country_id
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'

# Aliased cross join.
[plan,header]> SELECT * FROM movies m, genres AS g, studios s
---
NestedLoopJoin: inner
├─ NestedLoopJoin: inner
│  ├─ Scan: movies as m
│  └─ Scan: genres as g
└─ Scan: studios as s
m.id, m.title, m.studio_id, m.genre_id, m.released, m.rating, m.ultrahd, g.id, g.name, s.id, s.name, s.country_id
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 2, 'Lionsgate', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 1, 'Mosfilm', 'ru'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 2, 'Lionsgate', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 3, 'StudioCanal', 'fr'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 1, 'Mosfilm', 'ru'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 3, 'StudioCanal', 'fr'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'

# Cross join with self errors.
!> SELECT * FROM genres, genres
---
Error: invalid input: duplicate table name genres

# But it works when aliased.
[plan,header]> SELECT * FROM genres AS a, genres AS b, genres AS c
---
NestedLoopJoin: inner
├─ NestedLoopJoin: inner
│  ├─ Scan: genres as a
│  └─ Scan: genres as b
└─ Scan: genres as c
a.id, a.name, b.id, b.name, c.id, c.name
1, 'Science Fiction', 1, 'Science Fiction', 1, 'Science Fiction'
1, 'Science Fiction', 1, 'Science Fiction', 2, 'Action'
1, 'Science Fiction', 1, 'Science Fiction', 3, 'Comedy'
1, 'Science Fiction', 2, 'Action', 1, 'Science Fiction'
1, 'Science Fiction', 2, 'Action', 2, 'Action'
1, 'Science Fiction', 2, 'Action', 3, 'Comedy'
1, 'Science Fiction', 3, 'Comedy', 1, 'Science Fiction'
1, 'Science Fiction', 3, 'Comedy', 2, 'Action'
1, 'Science Fiction', 3, 'Comedy', 3, 'Comedy'
2, 'Action', 1, 'Science Fiction', 1, 'Science Fiction'
2, 'Action', 1, 'Science Fiction', 2, 'Action'
2, 'Action', 1, 'Science Fiction', 3, 'Comedy'
2, 'Action', 2, 'Action', 1, 'Science Fiction'
2, 'Action', 2, 'Action', 2, 'Action'
2, 'Action', 2, 'Action', 3, 'Comedy'
2, 'Action', 3, 'Comedy', 1, 'Science Fiction'
2, 'Action', 3, 'Comedy', 2, 'Action'
2, 'Action', 3, 'Comedy', 3, 'Comedy'
3, 'Comedy', 1, 'Science Fiction', 1, 'Science Fiction'
3, 'Comedy', 1, 'Science Fiction', 2, 'Action'
3, 'Comedy', 1, 'Science Fiction', 3, 'Comedy'
3, 'Comedy', 2, 'Action', 1, 'Science Fiction'
3, 'Comedy', 2, 'Action', 2, 'Action'
3, 'Comedy', 2, 'Action', 3, 'Comedy'
3, 'Comedy', 3, 'Comedy', 1, 'Science Fiction'
3, 'Comedy', 3, 'Comedy', 2, 'Action'
3, 'Comedy', 3, 'Comedy', 3, 'Comedy'

# Duplicate aliases error.
!> SELECT * FROM movies a, genres a
!> SELECT * FROM movies a CROSS JOIN genres a
---
Error: invalid input: duplicate table name a
Error: invalid input: duplicate table name a

# An explicit CROSS JOIN with an ON predicate should error. It's not a cross join.
!> SELECT * FROM movies CROSS JOIN genres ON movies.genre_id = genres.id
---
Error: invalid input: unexpected token ON


================================================
FILE: src/sql/testscripts/queries/join_inner
================================================
# Tests inner joins.

# Set up a movies dataset.
> CREATE TABLE countries ( \
    id STRING PRIMARY KEY, \
    name STRING NOT NULL \
)
> INSERT INTO countries VALUES \
    ('fr', 'France'), \
    ('ru', 'Russia'), \
    ('us', 'United States of America')
>CREATE TABLE genres ( \
    id INTEGER PRIMARY KEY, \
    name STRING NOT NULL \
)
> INSERT INTO genres VALUES \
    (1, 'Science Fiction'), \
    (2, 'Action'), \
    (3, 'Comedy')
> CREATE TABLE studios ( \
    id INTEGER PRIMARY KEY, \
    name STRING NOT NULL, \
    country_id STRING INDEX REFERENCES countries \
)
> INSERT INTO studios VALUES \
    (1, 'Mosfilm', 'ru'), \
    (2, 'Lionsgate', 'us'), \
    (3, 'StudioCanal', 'fr'), \
    (4, 'Warner Bros', 'us')
> CREATE TABLE movies ( \
    id INTEGER PRIMARY KEY, \
    title STRING NOT NULL, \
    studio_id INTEGER NOT NULL INDEX REFERENCES studios, \
    genre_id INTEGER NOT NULL INDEX REFERENCES genres, \
    released INTEGER NOT NULL, \
    rating FLOAT, \
    ultrahd BOOLEAN \
)
> INSERT INTO movies VALUES \
    (1, 'Stalker', 1, 1, 1979, 8.2, NULL), \
    (2, 'Sicario', 2, 2, 2015, 7.6, TRUE), \
    (3, 'Primer', 3, 1, 2004, 6.9, NULL), \
    (4, 'Heat', 4, 2, 1995, 8.2, TRUE), \
    (5, 'The Fountain', 4, 1, 2006, 7.2, FALSE), \
    (6, 'Solaris', 1, 1, 1972, 8.1, NULL), \
    (7, 'Gravity', 4, 1, 2013, 7.7, TRUE), \
    (8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE), \
    (9, 'Birdman', 4, 3, 2014, 7.7, TRUE), \
    (10, 'Inception', 4, 1, 2010, 8.8, TRUE)
---
ok

# Inner join on foreign key.
[plan,header]> SELECT * FROM movies INNER JOIN genres ON movies.genre_id = genres.id
---
HashJoin: inner on movies.genre_id = genres.id
├─ Scan: movies
└─ Scan: genres
movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'

# Implicit inner join.
[plan,header]> SELECT * FROM movies JOIN genres ON movies.genre_id = genres.id
---
HashJoin: inner on movies.genre_id = genres.id
├─ Scan: movies
└─ Scan: genres
movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'

# Implicit inner join with cross join and WHERE.
[plan,header]> SELECT * FROM movies, genres WHERE movies.genre_id = genres.id
---
HashJoin: inner on movies.genre_id = genres.id
├─ Scan: movies
└─ Scan: genres
movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'

# Three-way inner join.
[plan,header]> SELECT * FROM movies \
    INNER JOIN genres ON movies.genre_id = genres.id \
    INNER JOIN studios ON movies.studio_id = studios.id
---
HashJoin: inner on movies.studio_id = studios.id
├─ HashJoin: inner on movies.genre_id = genres.id
│  ├─ Scan: movies
│  └─ Scan: genres
└─ Scan: studios
movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name, studios.id, studios.name, studios.country_id
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'

# Inner join on TRUE and FALSE.
[plan]> SELECT * FROM movies INNER JOIN genres ON TRUE
---
NestedLoopJoin: inner
├─ Scan: movies
└─ Scan: genres
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 1, 'Science Fiction'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 3, 'Comedy'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 2, 'Action'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 3, 'Comedy'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 2, 'Action'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 3, 'Comedy'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 2, 'Action'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 3, 'Comedy'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 1, 'Science Fiction'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 2, 'Action'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 1, 'Science Fiction'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 2, 'Action'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 2, 'Action'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 3, 'Comedy'

[plan]> SELECT * FROM movies INNER JOIN genres ON FALSE
---
Nothing

# Inner join on multiple predicates.
[plan]> SELECT * FROM movies INNER JOIN genres ON movies.genre_id = genres.id AND movies.id = genres.id
---
NestedLoopJoin: inner on movies.genre_id = genres.id AND movies.id = genres.id
├─ Scan: movies
└─ Scan: genres
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'

# Aliased inner join.
[plan]> SELECT * FROM movies m INNER JOIN genres g ON m.genre_id = g.id INNER JOIN studios AS s ON m.studio_id = s.id
---
HashJoin: inner on m.studio_id = s.id
├─ HashJoin: inner on m.genre_id = g.id
│  ├─ Scan: movies as m
│  └─ Scan: genres as g
└─ Scan: studios as s
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action', 2, 'Lionsgate', 'us'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 1, 'Science Fiction', 3, 'StudioCanal', 'fr'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, 2, 'Action', 4, 'Warner Bros', 'us'
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
6, 'Solaris', 1, 1, 1972, 8.1, NULL, 1, 'Science Fiction', 1, 'Mosfilm', 'ru'
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, 3, 'Comedy', 2, 'Lionsgate', 'us'
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, 3, 'Comedy', 4, 'Warner Bros', 'us'
10, 'Inception', 4, 1, 2010, 8.8, TRUE, 1, 'Science Fiction', 4, 'Warner Bros', 'us'

# Inner join with missing ON errors.
!> SELECT * FROM movies INNER JOIN genres
---
Error: invalid input: unexpected end of input

# Inner join on WHERE errors.
!> SELECT * FROM movies INNER JOIN genres WHERE movies.genre_id = genres.id
---
Error: invalid input: expected token ON, found WHERE

# Errors on missing table or column.
!> SELECT * FROM movies INNER JOIN unknown on movies.id = unknown.id
!> SELECT * FROM movies INNER JOIN genres on movies.unknown_id = genres.id
---
Error: invalid input: table unknown does not exist
Error: invalid input: unknown column movies.unknown_id

# Hash joins with multiple matches work, on either side of the join.
[plan]> SELECT movies.title, genres.name FROM movies JOIN genres ON movies.genre_id = genres.id
---
Projection: movies.title, genres.name
└─ HashJoin: inner on movies.genre_id = genres.id
   ├─ Scan: movies
   └─ Scan: genres
'Stalker', 'Science Fiction'
'Sicario', 'Action'
'Primer', 'Science Fiction'
'Heat', 'Action'
'The Fountain', 'Science Fiction'
'Solaris', 'Science Fiction'
'Gravity', 'Science Fiction'
'Blindspotting', 'Comedy'
'Birdman', 'Comedy'
'Inception', 'Science Fiction'

[plan]> SELECT movies.title, genres.name FROM genres JOIN movies ON genres.id = movies.genre_id
---
Projection: movies.title, genres.name
└─ HashJoin: inner on genres.id = movies.genre_id
   ├─ Scan: genres
   └─ Scan: movies
'Stalker', 'Science Fiction'
'Primer', 'Science Fiction'
'The Fountain', 'Science Fiction'
'Solaris', 'Science Fiction'
'Gravity', 'Science Fiction'
'Inception', 'Science Fiction'
'Sicario', 'Action'
'Heat', 'Action'
'Blindspotting', 'Comedy'
'Birdman', 'Comedy'

# Also try multi-match self hash joins on ultrahd, where both sides have
# multiple matches. Note that NULL matches are ignored.
[plan]> SELECT a.title, b.title FROM movies a JOIN movies b ON a.ultrahd = b.ultrahd
---
Projection: a.title, b.title
└─ HashJoin: inner on a.ultrahd = b.ultrahd
   ├─ Scan: movies as a
   └─ Scan: movies as b
'Sicario', 'Sicario'
'Sicario', 'Heat'
'Sicario', 'Gravity'
'Sicario', 'Blindspotting'
'Sicario', 'Birdman'
'Sicario', 'Inception'
'Heat', 'Sicario'
'Heat', 'Heat'
'Heat', 'Gravity'
'Heat', 'Blindspotting'
'Heat', 'Birdman'
'Heat', 'Inception'
'The Fountain', 'The Fountain'
'Gravity', 'Sicario'
'Gravity', 'Heat'
'Gravity', 'Gravity'
'Gravity', 'Blindspotting'
'Gravity', 'Birdman'
'Gravity', 'Inception'
'Blindspotting', 'Sicario'
'Blindspotting', 'Heat'
'Blindspotting', 'Gravity'
'Blindspotting', 'Blindspotting'
'Blindspotting', 'Birdman'
'Blindspotting', 'Inception'
'Birdman', 'Sicario'
'Birdman', 'Heat'
'Birdman', 'Gravity'
'Birdman', 'Blindspotting'
'Birdman', 'Birdman'
'Birdman', 'Inception'
'Inception', 'Sicario'
'Inception', 'Heat'
'Inception', 'Gravity'
'Inception', 'Blindspotting'
'Inception', 'Birdman'
'Inception', 'Inception'

# Try a complex multi-way join with multiple joins of the same table. Uses GROUP
# BY to discard duplicates from the cross join. The query finds all movies
# belonging to a studio that's released at least one movie rated 8 or higher.
[plan]> SELECT m.id, m.title, g.name AS genre, s.name AS studio, m.rating \
  FROM movies m JOIN genres g ON m.genre_id = g.id, \
    studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8 \
  WHERE m.studio_id = s.id \
  GROUP BY m.id, m.title, g.name, s.name, m.rating, m.released \
  ORDER BY m.rating DESC, m.released ASC, m.id ASC
---
Remap: m.id, m.title, genre, studio, m.rating (dropped: m.released)
└─ Order: m.rating desc, m.released asc, m.id asc
   └─ Projection: m.id, m.title, g.name as genre, s.name as studio, m.rating, m.released
      └─ Aggregate: m.id, m.title, g.name, s.name, m.rating, m.released
         └─ HashJoin: inner on m.studio_id = s.id
            ├─ HashJoin: inner on m.genre_id = g.id
            │  ├─ Scan: movies as m
            │  └─ Scan: genres as g
            └─ HashJoin: inner on s.id = good.studio_id
               ├─ Scan: studios as s
               └─ Scan: movies as good (good.rating > 8 OR good.rating = 8)
10, 'Inception', 'Science Fiction', 'Warner Bros', 8.8
1, 'Stalker', 'Science Fiction', 'Mosfilm', 8.2
4, 'Heat', 'Action', 'Warner Bros', 8.2
6, 'Solaris', 'Science Fiction', 'Mosfilm', 8.1
7, 'Gravity', 'Science Fiction', 'Warner Bros', 7.7
9, 'Birdman', 'Comedy', 'Warner Bros', 7.7
5, 'The Fountain', 'Science Fiction', 'Warner Bros', 7.2


================================================
FILE: src/sql/testscripts/queries/join_outer
================================================
# Tests left/right outer joins.

# Set up a movies dataset.
> CREATE TABLE countries ( \
    id STRING PRIMARY KEY, \
    name STRING NOT NULL \
)
> INSERT INTO countries VALUES \
    ('fr', 'France'), \
    ('ru', 'Russia'), \
    ('us', 'United States of America')
>CREATE TABLE genres ( \
    id INTEGER PRIMARY KEY, \
    name STRING NOT NULL \
)
> INSERT INTO genres VALUES \
    (1, 'Science Fiction'), \
    (2, 'Action'), \
    (3, 'Comedy')
> CREATE TABLE studios ( \
    id INTEGER PRIMARY KEY, \
    name STRING NOT NULL, \
    country_id STRING INDEX REFERENCES countries \
)
> INSERT INTO studios VALUES \
    (1, 'Mosfilm', 'ru'), \
    (2, 'Lionsgate', 'us'), \
    (3, 'StudioCanal', 'fr'), \
    (4, 'Warner Bros', 'us')
> CREATE TABLE movies ( \
    id INTEGER PRIMARY KEY, \
    title STRING NOT NULL, \
    studio_id INTEGER NOT NULL INDEX REFERENCES studios, \
    genre_id INTEGER NOT NULL INDEX REFERENCES genres, \
    released INTEGER NOT NULL, \
    rating FLOAT, \
    ultrahd BOOLEAN \
)
> INSERT INTO movies VALUES \
    (1, 'Stalker', 1, 1, 1979, 8.2, NULL), \
    (2, 'Sicario', 2, 2, 2015, 7.6, TRUE), \
    (3, 'Primer', 3, 1, 2004, 6.9, NULL), \
    (4, 'Heat', 4, 2, 1995, 8.2, TRUE), \
    (5, 'The Fountain', 4, 1, 2006, 7.2, FALSE), \
    (6, 'Solaris', 1, 1, 1972, 8.1, NULL), \
    (7, 'Gravity', 4, 1, 2013, 7.7, TRUE), \
    (8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE), \
    (9, 'Birdman', 4, 3, 2014, 7.7, TRUE), \
    (10, 'Inception', 4, 1, 2010, 8.8, TRUE)
---
ok

# Left join.
[plan]> SELECT * FROM movies LEFT JOIN genres ON movies.id = genres.id
---
HashJoin: outer on movies.id = genres.id
├─ Scan: movies
└─ Scan: genres
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, NULL, NULL
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, NULL, NULL
6, 'Solaris', 1, 1, 1972, 8.1, NULL, NULL, NULL
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, NULL, NULL
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, NULL, NULL
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, NULL, NULL
10, 'Inception', 4, 1, 2010, 8.8, TRUE, NULL, NULL

# Right join.
[plan]> SELECT * FROM genres RIGHT JOIN movies ON movies.id = genres.id
---
Remap: genres.id, genres.name, movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd
└─ HashJoin: outer on movies.id = genres.id
   ├─ Scan: movies
   └─ Scan: genres
1, 'Science Fiction', 1, 'Stalker', 1, 1, 1979, 8.2, NULL
2, 'Action', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE
3, 'Comedy', 3, 'Primer', 3, 1, 2004, 6.9, NULL
NULL, NULL, 4, 'Heat', 4, 2, 1995, 8.2, TRUE
NULL, NULL, 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE
NULL, NULL, 6, 'Solaris', 1, 1, 1972, 8.1, NULL
NULL, NULL, 7, 'Gravity', 4, 1, 2013, 7.7, TRUE
NULL, NULL, 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE
NULL, NULL, 9, 'Birdman', 4, 3, 2014, 7.7, TRUE
NULL, NULL, 10, 'Inception', 4, 1, 2010, 8.8, TRUE

# Optional OUTER keyword.
[plan]> SELECT * FROM movies LEFT OUTER JOIN genres ON movies.id = genres.id
---
HashJoin: outer on movies.id = genres.id
├─ Scan: movies
└─ Scan: genres
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, NULL, NULL
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, NULL, NULL
6, 'Solaris', 1, 1, 1972, 8.1, NULL, NULL, NULL
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, NULL, NULL
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, NULL, NULL
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, NULL, NULL
10, 'Inception', 4, 1, 2010, 8.8, TRUE, NULL, NULL

[plan]> SELECT * FROM genres RIGHT OUTER JOIN movies ON movies.id = genres.id
---
Remap: genres.id, genres.name, movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd
└─ HashJoin: outer on movies.id = genres.id
   ├─ Scan: movies
   └─ Scan: genres
1, 'Science Fiction', 1, 'Stalker', 1, 1, 1979, 8.2, NULL
2, 'Action', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE
3, 'Comedy', 3, 'Primer', 3, 1, 2004, 6.9, NULL
NULL, NULL, 4, 'Heat', 4, 2, 1995, 8.2, TRUE
NULL, NULL, 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE
NULL, NULL, 6, 'Solaris', 1, 1, 1972, 8.1, NULL
NULL, NULL, 7, 'Gravity', 4, 1, 2013, 7.7, TRUE
NULL, NULL, 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE
NULL, NULL, 9, 'Birdman', 4, 3, 2014, 7.7, TRUE
NULL, NULL, 10, 'Inception', 4, 1, 2010, 8.8, TRUE

# Truncates when the inner side is shorter.
[plan]> SELECT * FROM genres LEFT JOIN movies ON movies.id = genres.id
---
HashJoin: outer on genres.id = movies.id
├─ Scan: genres
└─ Scan: movies
1, 'Science Fiction', 1, 'Stalker', 1, 1, 1979, 8.2, NULL
2, 'Action', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE
3, 'Comedy', 3, 'Primer', 3, 1, 2004, 6.9, NULL

[plan]> SELECT * FROM movies RIGHT JOIN genres ON movies.id = genres.id
---
Remap: movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd, genres.id, genres.name
└─ HashJoin: outer on genres.id = movies.id
   ├─ Scan: genres
   └─ Scan: movies
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'

# Arbitrary predicate.
[plan]> SELECT * FROM movies LEFT JOIN genres ON genres.id >= movies.id
---
NestedLoopJoin: outer on genres.id > movies.id OR genres.id = movies.id
├─ Scan: movies
└─ Scan: genres
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 2, 'Action'
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 3, 'Comedy'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 3, 'Comedy'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, NULL, NULL
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, NULL, NULL
6, 'Solaris', 1, 1, 1972, 8.1, NULL, NULL, NULL
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, NULL, NULL
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, NULL, NULL
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, NULL, NULL
10, 'Inception', 4, 1, 2010, 8.8, TRUE, NULL, NULL

# Three-way join.
[plan]> SELECT * FROM studios \
    LEFT JOIN genres ON studios.id = genres.id \
    RIGHT JOIN movies ON movies.id = studios.id
---
Remap: studios.id, studios.name, studios.country_id, genres.id, genres.name, movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd
└─ HashJoin: outer on movies.id = studios.id
   ├─ Scan: movies
   └─ HashJoin: outer on studios.id = genres.id
      ├─ Scan: studios
      └─ Scan: genres
1, 'Mosfilm', 'ru', 1, 'Science Fiction', 1, 'Stalker', 1, 1, 1979, 8.2, NULL
2, 'Lionsgate', 'us', 2, 'Action', 2, 'Sicario', 2, 2, 2015, 7.6, TRUE
3, 'StudioCanal', 'fr', 3, 'Comedy', 3, 'Primer', 3, 1, 2004, 6.9, NULL
4, 'Warner Bros', 'us', NULL, NULL, 4, 'Heat', 4, 2, 1995, 8.2, TRUE
NULL, NULL, NULL, NULL, NULL, 5, 'The Fountain', 4, 1, 2006, 7.2, FALSE
NULL, NULL, NULL, NULL, NULL, 6, 'Solaris', 1, 1, 1972, 8.1, NULL
NULL, NULL, NULL, NULL, NULL, 7, 'Gravity', 4, 1, 2013, 7.7, TRUE
NULL, NULL, NULL, NULL, NULL, 8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE
NULL, NULL, NULL, NULL, NULL, 9, 'Birdman', 4, 3, 2014, 7.7, TRUE
NULL, NULL, NULL, NULL, NULL, 10, 'Inception', 4, 1, 2010, 8.8, TRUE

# Aliased tables.
[plan]> SELECT * FROM movies m LEFT JOIN genres AS g on m.id = g.id
---
HashJoin: outer on m.id = g.id
├─ Scan: movies as m
└─ Scan: genres as g
1, 'Stalker', 1, 1, 1979, 8.2, NULL, 1, 'Science Fiction'
2, 'Sicario', 2, 2, 2015, 7.6, TRUE, 2, 'Action'
3, 'Primer', 3, 1, 2004, 6.9, NULL, 3, 'Comedy'
4, 'Heat', 4, 2, 1995, 8.2, TRUE, NULL, NULL
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE, NULL, NULL
6, 'Solaris', 1, 1, 1972, 8.1, NULL, NULL, NULL
7, 'Gravity', 4, 1, 2013, 7.7, TRUE, NULL, NULL
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE, NULL, NULL
9, 'Birdman', 4, 3, 2014, 7.7, TRUE, NULL, NULL
10, 'Inception', 4, 1, 2010, 8.8, TRUE, NULL, NULL

# Outer joins without an ON clause error.
!> SELECT * FROM movies LEFT JOIN genres
!> SELECT * FROM movies RIGHT JOIN genres
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input


================================================
FILE: src/sql/testscripts/queries/limit
================================================
# Tests LIMIT clauses.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')
---
ok

# Test all limits from 4 to 0.
[plan]> SELECT * FROM test LIMIT 4
---
Limit: 4
└─ Scan: test
1, 'a'
2, 'b'
3, 'c'

> SELECT * FROM test LIMIT 3
---
1, 'a'
2, 'b'
3, 'c'

> SELECT * FROM test LIMIT 2
---
1, 'a'
2, 'b'

> SELECT * FROM test LIMIT 1
---
1, 'a'

[plan]> SELECT * FROM test LIMIT 0
---
Nothing

# A max i64 limit works.
> SELECT * FROM test LIMIT 9223372036854775807
---
1, 'a'
2, 'b'
3, 'c'

# Limits can also be used with constant values.
[plan]> SELECT 1, 2, 3 LIMIT 1
---
Limit: 1
└─ Projection: 1, 2, 3
   └─ Values: blank row
1, 2, 3

[plan]> SELECT 1, 2, 3 LIMIT 0
---
Nothing

# Limits can be expressions, but only constant ones.
[plan]> SELECT * FROM test LIMIT 1 + 1
---
Limit: 2
└─ Scan: test
1, 'a'
2, 'b'

!> SELECT * FROM test LIMIT id
---
Error: invalid input: expression must be constant, found column id

# Negative and NULL limits error.
!> SELECT * FROM test LIMIT -1
!> SELECT * FROM test LIMIT NULL
---
Error: invalid input: invalid limit -1
Error: invalid input: invalid limit NULL

# Non-integer limits error.
!> SELECT * FROM test LIMIT FALSE
!> SELECT * FROM test LIMIT 1.0
!> SELECT * FROM test LIMIT '1'
---
Error: invalid input: invalid limit FALSE
Error: invalid input: invalid limit 1.0
Error: invalid input: invalid limit '1'

# Multiple limits error.
!> SELECT * FROM test LIMIT 1 2
!> SELECT * FROM test LIMIT 1,2
---
Error: invalid input: unexpected token 2
Error: invalid input: unexpected token ,


================================================
FILE: src/sql/testscripts/queries/offset
================================================
# Tests OFFSET clauses.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')
---
ok

# Test all offsets from 4 to 0.
[plan]> SELECT * FROM test OFFSET 4
---
Offset: 4
└─ Scan: test

> SELECT * FROM test OFFSET 3
---
ok

> SELECT * FROM test OFFSET 2
---
3, 'c'

> SELECT * FROM test OFFSET 1
---
2, 'b'
3, 'c'

[plan]> SELECT * FROM test OFFSET 0
---
Offset: 0
└─ Scan: test
1, 'a'
2, 'b'
3, 'c'

# A max i64 offset works.
> SELECT * FROM test OFFSET 9223372036854775807
---
ok

# Offsets can also be used with constant values.
[plan]> SELECT 1, 2, 3 OFFSET 1
---
Offset: 1
└─ Projection: 1, 2, 3
   └─ Values: blank row

[plan]> SELECT 1, 2, 3 OFFSET 0
---
Offset: 0
└─ Projection: 1, 2, 3
   └─ Values: blank row
1, 2, 3

# Offsets can be expressions, but only constant ones.
[plan]> SELECT * FROM test OFFSET 2 - 1
---
Offset: 1
└─ Scan: test
2, 'b'
3, 'c'

!> SELECT * FROM test OFFSET id
---
Error: invalid input: expression must be constant, found column id

# Negative and NULL offsets error.
!> SELECT * FROM test OFFSET -1
!> SELECT * FROM test OFFSET NULL
---
Error: invalid input: invalid offset -1
Error: invalid input: invalid offset NULL

# Non-integer offsets error.
!> SELECT * FROM test OFFSET FALSE
!> SELECT * FROM test OFFSET 1.0
!> SELECT * FROM test OFFSET '1'
---
Error: invalid input: invalid offset FALSE
Error: invalid input: invalid offset 1.0
Error: invalid input: invalid offset '1'

# Multiple offsets error.
!> SELECT * FROM test OFFSET 1 2
!> SELECT * FROM test OFFSET 1,2
---
Error: invalid input: unexpected token 2
Error: invalid input: unexpected token ,


================================================
FILE: src/sql/testscripts/queries/order
================================================
# Tests ORDER BY clauses.

# Create a table with representative values of all types.
> CREATE TABLE test ( \
    id INT PRIMARY KEY, \
    "bool" BOOLEAN, \
    "int" INTEGER, \
    "float" FLOAT, \
    "string" STRING, \
    static INT \
)
> INSERT INTO test VALUES (0, NULL,  NULL,  NULL,      NULL,  1)
> INSERT INTO test VALUES (1, TRUE,  0,     3.14,      'a',   1)
> INSERT INTO test VALUES (2, FALSE, -1,    -2.718,    'ab',  1)
> INSERT INTO test VALUES (3, NULL,  1,     0.0,       'aaa', 1)
> INSERT INTO test VALUES (4, NULL,  1000,  -0.0,      'A',   1)
> INSERT INTO test VALUES (5, NULL,  -1000, INFINITY,  '',    1)
> INSERT INTO test VALUES (6, NULL,  7,     -INFINITY, 'åa',  1)
> INSERT INTO test VALUES (7, NULL,  -9,    NAN,       'Åa',  1)
> INSERT INTO test VALUES (8, NULL,  NULL,  NULL,      'B',   1)
> INSERT INTO test VALUES (9, NULL,  NULL,  NULL,      '👍',  1)

> CREATE TABLE other (id INT PRIMARY KEY, value STRING)
> INSERT INTO other VALUES (1, 'a'), (2, 'b')
---
ok

# Order by constant values. There's only one row, but it should be valid.
[plan]> SELECT 1 AS value ORDER BY value ASC
---
Order: value asc
└─ Projection: 1 as value
   └─ Values: blank row
1

[plan]> SELECT 1 AS value ORDER BY value DESC
---
Order: value desc
└─ Projection: 1 as value
   └─ Values: blank row
1

# Order by primary key.
[plan]> SELECT * FROM test ORDER BY id ASC
---
Order: test.id asc
└─ Scan: test
0, NULL, NULL, NULL, NULL, 1
1, TRUE, 0, 3.14, 'a', 1
2, FALSE, -1, -2.718, 'ab', 1
3, NULL, 1, 0.0, 'aaa', 1
4, NULL, 1000, 0.0, 'A', 1
5, NULL, -1000, inf, '', 1
6, NULL, 7, -inf, 'åa', 1
7, NULL, -9, NaN, 'Åa', 1
8, NULL, NULL, NULL, 'B', 1
9, NULL, NULL, NULL, '👍', 1

[plan]> SELECT * FROM test ORDER BY id DESC
---
Order: test.id desc
└─ Scan: test
9, NULL, NULL, NULL, '👍', 1
8, NULL, NULL, NULL, 'B', 1
7, NULL, -9, NaN, 'Åa', 1
6, NULL, 7, -inf, 'åa', 1
5, NULL, -1000, inf, '', 1
4, NULL, 1000, 0.0, 'A', 1
3, NULL, 1, 0.0, 'aaa', 1
2, FALSE, -1, -2.718, 'ab', 1
1, TRUE, 0, 3.14, 'a', 1
0, NULL, NULL, NULL, NULL, 1

# Booleans.
> SELECT id, "bool" FROM test ORDER BY "bool" ASC
---
0, NULL
3, NULL
4, NULL
5, NULL
6, NULL
7, NULL
8, NULL
9, NULL
2, FALSE
1, TRUE

> SELECT id, "bool" FROM test ORDER BY "bool" DESC
---
1, TRUE
2, FALSE
0, NULL
3, NULL
4, NULL
5, NULL
6, NULL
7, NULL
8, NULL
9, NULL

# Integers.
> SELECT id, "int" FROM test ORDER BY "int" ASC
---
0, NULL
8, NULL
9, NULL
5, -1000
7, -9
2, -1
1, 0
3, 1
6, 7
4, 1000

> SELECT id, "int" FROM test ORDER BY "int" DESC
---
4, 1000
6, 7
3, 1
1, 0
2, -1
7, -9
5, -1000
0, NULL
8, NULL
9, NULL

# Floats.
> SELECT id, "float" FROM test ORDER BY "float" ASC
---
0, NULL
8, NULL
9, NULL
6, -inf
2, -2.718
3, 0.0
4, 0.0
1, 3.14
5, inf
7, NaN

> SELECT id, "float" FROM test ORDER BY "float" DESC
---
7, NaN
5, inf
1, 3.14
3, 0.0
4, 0.0
2, -2.718
6, -inf
0, NULL
8, NULL
9, NULL

# Strings.
> SELECT id, "string" FROM test ORDER BY "string" ASC
---
0, NULL
5, ''
4, 'A'
8, 'B'
1, 'a'
3, 'aaa'
2, 'ab'
7, 'Åa'
6, 'åa'
9, '👍'

> SELECT id, "string" FROM test ORDER BY "string" DESC
---
9, '👍'
6, 'åa'
7, 'Åa'
2, 'ab'
3, 'aaa'
1, 'a'
8, 'B'
4, 'A'
5, ''
0, NULL

# When all values are equal, they are stably ordered by the primary key in
# ascending order (the scan order).
> SELECT id, static FROM test ORDER BY static ASC
---
0, 1
1, 1
2, 1
3, 1
4, 1
5, 1
6, 1
7, 1
8, 1
9, 1


> SELECT id, static FROM test ORDER BY static DESC
---
0, 1
1, 1
2, 1
3, 1
4, 1
5, 1
6, 1
7, 1
8, 1
9, 1

# Order by multiple columns. Again, the ascending primary key is tiebreaker.
> SELECT id, static, "bool", "int", "string" FROM test \
  ORDER BY static ASC, "bool" DESC, "int" ASC, "string" DESC
---
1, 1, TRUE, 0, 'a'
2, 1, FALSE, -1, 'ab'
9, 1, NULL, NULL, '👍'
8, 1, NULL, NULL, 'B'
0, 1, NULL, NULL, NULL
5, 1, NULL, -1000, ''
7, 1, NULL, -9, 'Åa'
3, 1, NULL, 1, 'aaa'
6, 1, NULL, 7, 'åa'
4, 1, NULL, 1000, 'A'

> SELECT id, static, "bool", "int", "string" FROM test \
  ORDER BY static DESC, "bool" ASC, "int" DESC, "string" ASC
---
4, 1, NULL, 1000, 'A'
6, 1, NULL, 7, 'åa'
3, 1, NULL, 1, 'aaa'
7, 1, NULL, -9, 'Åa'
5, 1, NULL, -1000, ''
0, 1, NULL, NULL, NULL
8, 1, NULL, NULL, 'B'
9, 1, NULL, NULL, '👍'
2, 1, FALSE, -1, 'ab'
1, 1, TRUE, 0, 'a'

# Can order by expressions.
[plan]> SELECT id, "float" FROM test ORDER BY "float" ^ 2
---
Order: test.float ^ 2 asc
└─ Projection: test.id, test.float
   └─ Scan: test
0, NULL
8, NULL
9, NULL
3, 0.0
4, 0.0
2, -2.718
1, 3.14
5, inf
6, -inf
7, NaN

# Can order by columns not in the result. Multiple references to the same column
# only result in one hidden column.
[plan]> SELECT id, "int" FROM test ORDER BY "bool" DESC
---
Remap: test.id, test.int (dropped: test.bool)
└─ Order: test.bool desc
   └─ Projection: test.id, test.int, test.bool
      └─ Scan: test
1, 0
2, -1
0, NULL
3, 1
4, 1000
5, -1000
6, 7
7, -9
8, NULL
9, NULL

[plan]> SELECT id, "int" FROM test ORDER BY "bool" DESC, "bool" ASC
---
Remap: test.id, test.int (dropped: test.bool)
└─ Order: test.bool desc, test.bool asc
   └─ Projection: test.id, test.int, test.bool
      └─ Scan: test
1, 0
2, -1
0, NULL
3, 1
4, 1000
5, -1000
6, 7
7, -9
8, NULL
9, NULL

# Can order on expressions on columns not in the result.
[plan]> SELECT id FROM test ORDER BY "float" ^ 2 - "int" ^ 2 DESC
---
Remap: test.id (dropped: test.float, test.int)
└─ Order: test.float ^ 2 - test.int ^ 2 desc
   └─ Projection: test.id, test.float, test.int
      └─ Scan: test
7
5
6
1
2
3
4
0
8
9

# Order by aliased table or column.
> SELECT id, "int" AS foo FROM test ORDER BY foo
---
0, NULL
8, NULL
9, NULL
5, -1000
7, -9
2, -1
1, 0
3, 1
6, 7
4, 1000

> SELECT id, "int" FROM test AS t ORDER BY t."int"
---
0, NULL
8, NULL
9, NULL
5, -1000
7, -9
2, -1
1, 0
3, 1
6, 7
4, 1000

# Order by an aliased expression.
> SELECT id, "int" ^ 2 AS square FROM test ORDER BY square ASC
---
0, NULL
8, NULL
9, NULL
1, 0
2, 1
3, 1
6, 49
7, 81
4, 1000000
5, 1000000

# Errors if the column is ambiguous.
!> SELECT id, "int" ^ 2 AS foo, "int" AS foo FROM test ORDER BY foo ASC
---
Error: invalid input: ambiguous column foo

# Prefers alias over table column if ambiguous, but not if fully qualified.
[plan]> SELECT id AS "int" FROM test ORDER BY "int" DESC
---
Order: int desc
└─ Projection: test.id as int
   └─ Scan: test
9
8
7
6
5
4
3
2
1
0

[plan]> SELECT id AS "int" FROM test ORDER BY test."int" DESC
---
Remap: int (dropped: test.int)
└─ Order: test.int desc
   └─ Projection: test.id as int, test.int
      └─ Scan: test
4
6
3
1
2
7
5
0
8
9

# Errors on unknown table or column, even the original table name when aliased.
!> SELECT * FROM test ORDER BY unknown
!> SELECT * FROM test ORDER BY test.unknown
!> SELECT * FROM test ORDER BY unknown.id
!> SELECT * FROM test AS t ORDER BY test."int"
---
Error: invalid input: unknown column unknown
Error: invalid input: unknown column test.unknown
Error: invalid input: unknown table unknown
Error: invalid input: unknown table test

# Errors on unknown direction.
!> SELECT * FROM test ORDER BY id UNKNOWN
---
Error: invalid input: unexpected token unknown

# Errors on trailing comma.
!> SELECT * FROM test ORDER BY id,
---
Error: invalid input: unexpected end of input

# Errors on ambiguous columns.
!> SELECT * FROM test, other ORDER BY id DESC
---
Error: invalid input: ambiguous column id

# Works with qualified columns, even when aliased.
[plan]> SELECT * FROM test, other ORDER BY other.id DESC
---
Order: other.id desc
└─ NestedLoopJoin: inner
   ├─ Scan: test
   └─ Scan: other
0, NULL, NULL, NULL, NULL, 1, 2, 'b'
1, TRUE, 0, 3.14, 'a', 1, 2, 'b'
2, FALSE, -1, -2.718, 'ab', 1, 2, 'b'
3, NULL, 1, 0.0, 'aaa', 1, 2, 'b'
4, NULL, 1000, 0.0, 'A', 1, 2, 'b'
5, NULL, -1000, inf, '', 1, 2, 'b'
6, NULL, 7, -inf, 'åa', 1, 2, 'b'
7, NULL, -9, NaN, 'Åa', 1, 2, 'b'
8, NULL, NULL, NULL, 'B', 1, 2, 'b'
9, NULL, NULL, NULL, '👍', 1, 2, 'b'
0, NULL, NULL, NULL, NULL, 1, 1, 'a'
1, TRUE, 0, 3.14, 'a', 1, 1, 'a'
2, FALSE, -1, -2.718, 'ab', 1, 1, 'a'
3, NULL, 1, 0.0, 'aaa', 1, 1, 'a'
4, NULL, 1000, 0.0, 'A', 1, 1, 'a'
5, NULL, -1000, inf, '', 1, 1, 'a'
6, NULL, 7, -inf, 'åa', 1, 1, 'a'
7, NULL, -9, NaN, 'Åa', 1, 1, 'a'
8, NULL, NULL, NULL, 'B', 1, 1, 'a'
9, NULL, NULL, NULL, '👍', 1, 1, 'a'

[plan]> SELECT * FROM test t, other o ORDER BY o.id DESC, t.id ASC
---
Order: o.id desc, t.id asc
└─ NestedLoopJoin: inner
   ├─ Scan: test as t
   └─ Scan: other as o
0, NULL, NULL, NULL, NULL, 1, 2, 'b'
1, TRUE, 0, 3.14, 'a', 1, 2, 'b'
2, FALSE, -1, -2.718, 'ab', 1, 2, 'b'
3, NULL, 1, 0.0, 'aaa', 1, 2, 'b'
4, NULL, 1000, 0.0, 'A', 1, 2, 'b'
5, NULL, -1000, inf, '', 1, 2, 'b'
6, NULL, 7, -inf, 'åa', 1, 2, 'b'
7, NULL, -9, NaN, 'Åa', 1, 2, 'b'
8, NULL, NULL, NULL, 'B', 1, 2, 'b'
9, NULL, NULL, NULL, '👍', 1, 2, 'b'
0, NULL, NULL, NULL, NULL, 1, 1, 'a'
1, TRUE, 0, 3.14, 'a', 1, 1, 'a'
2, FALSE, -1, -2.718, 'ab', 1, 1, 'a'
3, NULL, 1, 0.0, 'aaa', 1, 1, 'a'
4, NULL, 1000, 0.0, 'A', 1, 1, 'a'
5, NULL, -1000, inf, '', 1, 1, 'a'
6, NULL, 7, -inf, 'åa', 1, 1, 'a'
7, NULL, -9, NaN, 'Åa', 1, 1, 'a'
8, NULL, NULL, NULL, 'B', 1, 1, 'a'
9, NULL, NULL, NULL, '👍', 1, 1, 'a'

# Order by aggregates, both when in SELECT and otherwise.
[plan]> SELECT "bool", MAX("int") FROM test GROUP BY "bool" ORDER BY MAX("int") DESC
---
Order: #1 desc
└─ Aggregate: test.bool, max(test.int)
   └─ Scan: test
NULL, 1000
TRUE, 0
FALSE, -1

[plan]> SELECT "bool" FROM test GROUP BY "bool" ORDER BY MAX("int") DESC
---
Remap: test.bool (dropped: #1)
└─ Order: #1 desc
   └─ Aggregate: test.bool, max(test.int)
      └─ Scan: test
NULL
TRUE
FALSE

[plan]> SELECT "bool", MAX("int") FROM test GROUP BY "bool" ORDER BY MAX("int") - MIN("int") DESC
---
Remap: test.bool, #1 (dropped: #2)
└─ Order: #1 - #2 desc
   └─ Aggregate: test.bool, max(test.int), min(test.int)
      └─ Scan: test
NULL, 1000
FALSE, -1
TRUE, 0

# ORDER BY works with compound expressions using complex GROUP BY expressions
# that are not in the SELECT clause.
[plan]> SELECT COUNT(*) FROM test GROUP BY id % 2 ORDER BY 2 - id % 2 + 1 > 1
---
Remap: #0 (dropped: #1)
└─ Order: 2 - #1 + 1 > 1 asc
   └─ Projection: #1, #0
      └─ Aggregate: test.id % 2, count(TRUE)
         └─ Scan: test
5
5

# ORDER BY can use (un)qualified expressions for an (un)qualified GROUP BY.
[plan]> SELECT COUNT(*) FROM test GROUP BY "bool" ORDER BY test."bool"
---
Remap: #0 (dropped: test.bool)
└─ Order: test.bool asc
   └─ Projection: #1, test.bool
      └─ Aggregate: test.bool, count(TRUE)
         └─ Scan: test
8
1
1

[plan]> SELECT COUNT(*) FROM test GROUP BY test."bool" ORDER BY "bool"
---
Remap: #0 (dropped: test.bool)
└─ Order: test.bool asc
   └─ Projection: #1, test.bool
      └─ Aggregate: test.bool, count(TRUE)
         └─ Scan: test
8
1
1

# ORDER BY errors on columns not in the SELECT or GROUP BY clauses.
!> SELECT "bool", COUNT(*) FROM test GROUP BY "bool" ORDER BY id
---
Error: invalid input: column id must be used in an aggregate or GROUP BY expression


================================================
FILE: src/sql/testscripts/queries/select
================================================
# Tests the SELECT part of queries.

# Create a basic test table, and a secondary table for join column lookups.
> CREATE TABLE test ( \
    id INT PRIMARY KEY, \
    "bool" BOOLEAN, \
    "float" FLOAT, \
    "int" INT, \
    "string" STRING \
)
> INSERT INTO test VALUES (1, true, 3.14, 7, 'foo')
> INSERT INTO test VALUES (2, false, 2.718, 1, '👍')
> INSERT INTO test VALUES (3, NULL, NULL, NULL, NULL)

> CREATE TABLE other (id INT PRIMARY KEY, value STRING)
> INSERT INTO other VALUES (1, 'a'), (2, 'b')
---
ok

# Select constant values.
[plan]> select 1
---
Projection: 1
└─ Values: blank row
1

[plan]> SELECT NULL, NOT FALSE, 2^2+1, 3.14*2, 'Hi 👋'
---
Projection: NULL, TRUE, 5, 6.28, 'Hi 👋'
└─ Values: blank row
NULL, TRUE, 5, 6.28, 'Hi 👋'

# Bare select errors, as does trailing comma and identifier without a table.
!> SELECT
!> SELECT 1,
!> SELECT foo
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input
Error: invalid input: expression must be constant, found column foo

# Select from a table.
[plan,header]> SELECT * FROM test
---
Scan: test
test.id, test.bool, test.float, test.int, test.string
1, TRUE, 3.14, 7, 'foo'
2, FALSE, 2.718, 1, '👍'
3, NULL, NULL, NULL, NULL

[plan,header]> SELECT "bool" FROM test
---
Projection: test.bool
└─ Scan: test
test.bool
TRUE
FALSE
NULL

# * can't be used with table names, for simplicity.
!> SELECT test.* FROM test
---
Error: invalid input: expected identifier, got *

# A SELECT * without a table errors, as does a bare FROM.
!> SELECT *
!> SELECT * FROM
---
Error: invalid input: SELECT * requires a FROM clause
Error: invalid input: unexpected end of input

# A * errors in expressions. For simplicity, expressions only support scalars.
!> SELECT 1 + * FROM test
!> SELECT sqrt(*) FROM test
!> SELECT max(*) FROM test
---
Error: invalid input: unsupported use of *
Error: invalid input: unsupported use of *
Error: invalid input: unsupported use of *

# A * can be used multiple times.
[plan,header]> SELECT *, *, * FROM test
---
Projection: test.id, test.bool, test.float, test.int, test.string, test.id, test.bool, test.float, test.int, test.string, test.id, test.bool, test.float, test.int, test.string
└─ Scan: test
test.id, test.bool, test.float, test.int, test.string, test.id, test.bool, test.float, test.int, test.string, test.id, test.bool, test.float, test.int, test.string
1, TRUE, 3.14, 7, 'foo', 1, TRUE, 3.14, 7, 'foo', 1, TRUE, 3.14, 7, 'foo'
2, FALSE, 2.718, 1, '👍', 2, FALSE, 2.718, 1, '👍', 2, FALSE, 2.718, 1, '👍'
3, NULL, NULL, NULL, NULL, 3, NULL, NULL, NULL, NULL, 3, NULL, NULL, NULL, NULL

# Mix *, columns, column expressions, and constant expressions.
[plan,header]> SELECT id, 7-4, *, "float"^2 FROM test
---
Projection: test.id, 3, test.id, test.bool, test.float, test.int, test.string, test.float ^ 2
└─ Scan: test
test.id, , test.id, test.bool, test.float, test.int, test.string, 
1, 3, 1, TRUE, 3.14, 7, 'foo', 9.8596
2, 3, 2, FALSE, 2.718, 1, '👍', 7.387524
3, 3, 3, NULL, NULL, NULL, NULL, NULL

# Column names may be qualified or unqualified.
[header]> SELECT id, test."bool" FROM test
---
test.id, test.bool
1, TRUE
2, FALSE
3, NULL

# The table may be aliased, and qualified using the alias. The AS alias keyword
# is optional.
[header,plan]> SELECT id, t."bool" FROM test AS t
---
Projection: t.id, t.bool
└─ Scan: test as t
t.id, t.bool
1, TRUE
2, FALSE
3, NULL

[header]> SELECT id, t."bool" FROM test t
---
t.id, t.bool
1, TRUE
2, FALSE
3, NULL

# Unknown tables or columns error. Including the original table when aliased.
!> SELECT * FROM unknown
!> SELECT unknown FROM test
!> SELECT test.unknown FROM test
!> SELECT test.id.unknown FROM test
!> SELECT unknown.id FROM test
!> SELECT test.id FROM test AS t
---
Error: invalid input: table unknown does not exist
Error: invalid input: unknown column unknown
Error: invalid input: unknown column test.unknown
Error: invalid input: unexpected token .
Error: invalid input: unknown table unknown
Error: invalid input: unknown table test

# Columns, both constant and from tables, can be aliased.
# The AS keyword is optional.
[header,plan]> SELECT 1 AS one, test."int" value FROM test
---
Projection: 1 as one, test.int as value
└─ Scan: test
one, value
1, 7
1, 1
1, NULL

# Aliases can have special characters and keywords if quoted.
[header]> SELECT 1 AS "integer", 2 AS "hi 👋"
---
integer, hi 👋
1, 2

# Expressions can't reference aliases.
!> SELECT 1 AS one, one + 1
!> SELECT id AS alias, alias + 1 FROM test
---
Error: invalid input: expression must be constant, found column one
Error: invalid input: unknown column alias

# Aliases can have the same name as table columns, but won't shadow them.
[header]> SELECT 'foo' AS id, id, id + 3 FROM test
---
id, test.id, 
'foo', 1, 4
'foo', 2, 5
'foo', 3, 6

# Multiple aliases can have the same name.
[header]> SELECT 1 AS id, id, "float" AS id FROM test
---
id, test.id, id
1, 1, 3.14
1, 2, 2.718
1, 3, NULL

# Aliases can't be qualified.
!> SELECT 1 AS foo.bar
---
Error: invalid input: unexpected token .

# Bare and * aliases error.
!> SELECT 1 AS
!> SELECT * AS all FROM test
---
Error: invalid input: unexpected end of input
Error: invalid input: can't alias *

# Ambiguous columns error.
!> SELECT id FROM test, other
---
Error: invalid input: ambiguous column id

# Unambiguous columns don't, resulting in a cross join.
> SELECT "bool", value FROM test, other
---
TRUE, 'a'
TRUE, 'b'
FALSE, 'a'
FALSE, 'b'
NULL, 'a'
NULL, 'b'

# Qualified columns work, also when aliased.
> SELECT test.id, other.id FROM test, other
---
1, 1
1, 2
2, 1
2, 2
3, 1
3, 2

> SELECT t.id, o.id FROM test t, other o
---
1, 1
1, 2
2, 1
2, 2
3, 1
3, 2

# A select with no rows optimized to a Nothing node still emits headers.
[plan,header]> SELECT * FROM test WHERE FALSE
---
Nothing
test.id, test.bool, test.float, test.int, test.string


================================================
FILE: src/sql/testscripts/queries/where_
================================================
# Tests basic WHERE clauses.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')

> CREATE TABLE other (id INT PRIMARY KEY, "bool" BOOLEAN)
> INSERT INTO other VALUES (1, FALSE), (2, TRUE)
---
ok

# Constant TRUE and FALSE filters work as expected.
[plan]> SELECT * FROM test WHERE TRUE
---
Scan: test
1, 'a'
2, 'b'
3, 'c'

[plan]> SELECT * FROM test WHERE FALSE
---
Nothing

# NULL is treated as FALSE.
[plan]> SELECT * FROM test WHERE NULL
---
Nothing

# Field predicate expressions work as expected.
[plan]> SELECT * FROM test WHERE id > 1
---
Scan: test (test.id > 1)
2, 'b'
3, 'c'

[plan]> SELECT * FROM test WHERE id > 1 AND value < 'c'
---
Scan: test (test.id > 1 AND test.value < 'c')
2, 'b'

# Errors on non-boolean type.
!> SELECT * FROM test WHERE 1
!> SELECT * FROM test WHERE 1.0
!> SELECT * FROM test WHERE ''
---
Error: invalid input: filter returned 1, expected boolean
Error: invalid input: filter returned 1.0, expected boolean
Error: invalid input: filter returned '', expected boolean

# Errors on bare WHERE clause or multiple predicates.
!> SELECT * FROM test WHERE
!> SELECT * FROM test WHERE TRUE, TRUE
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected token ,

# Errors on unknown tables and columns.
!> SELECT * FROM test WHERE unknown > 0
!> SELECT * FROM test WHERE unknown.id > 0
---
Error: invalid input: unknown column unknown
Error: invalid input: unknown table unknown

# Qualified names are valid.
> SELECT * FROM test WHERE test.value = 'b'
---
2, 'b'

# Expression and column aliases aren't visible.
!> SELECT value AS v FROM test WHERE v = 'b'
!> SELECT 1 + 1 AS two WHERE two = 2
---
Error: invalid input: unknown column v
Error: invalid input: expression must be constant, found column two

# Table aliases are visible.
> SELECT * FROM test AS t WHERE t.id = 2
---
2, 'b'

# Ambiguous columns error.
!> SELECT * FROM test, other WHERE id > 1
---
Error: invalid input: ambiguous column id

# Unambiguous columns work.
> SELECT * FROM test, other WHERE value = 'b'
---
2, 'b', 1, FALSE
2, 'b', 2, TRUE


# Qualified columns work, also when aliased.
> SELECT * FROM test, other WHERE test.id = 2 AND other.id = 2
---
2, 'b', 2, TRUE

> SELECT * FROM test t, other o WHERE t.id = 2 AND o.id = 2
---
2, 'b', 2, TRUE

# WHERE can be combined with joins, even when aliased.
[plan]> SELECT * FROM test JOIN other ON test.id = other.id WHERE test.id > 1
---
HashJoin: inner on test.id = other.id
├─ Scan: test (test.id > 1)
└─ Scan: other
2, 'b', 2, TRUE

[plan]> SELECT * FROM test t JOIN other o ON t.id = o.id WHERE t.id > 1
---
HashJoin: inner on t.id = o.id
├─ Scan: test as t (t.id > 1)
└─ Scan: other as o
2, 'b', 2, TRUE


================================================
FILE: src/sql/testscripts/queries/where_index
================================================
# Tests WHERE index lookups.

# Create a table with representative values of all types.
> CREATE TABLE test ( \
    id INT PRIMARY KEY, \
    "bool" BOOLEAN INDEX, \
    "int" INTEGER INDEX, \
    "float" FLOAT INDEX, \
    "string" STRING INDEX \
)
> INSERT INTO test VALUES (0, NULL,  NULL,  NULL,      NULL)
> INSERT INTO test VALUES (1, TRUE,  0,     3.14,      'abc')
> INSERT INTO test VALUES (2, FALSE, -1,    -2.718,    'a')
> INSERT INTO test VALUES (3, TRUE,  1,     0.0,       'ABC')
> INSERT INTO test VALUES (4, NULL,  1,     -0.0,      '👍')
> INSERT INTO test VALUES (5, NULL,  NULL,  INFINITY,  'å')
> INSERT INTO test VALUES (6, NULL,  NULL,  NAN,       '')
---
ok

# Boolean lookups.
[plan]> SELECT * FROM test WHERE "bool" = TRUE
---
IndexLookup: test.bool (TRUE)
1, TRUE, 0, 3.14, 'abc'
3, TRUE, 1, 0.0, 'ABC'

[plan]> SELECT * FROM test WHERE "bool" = FALSE
---
IndexLookup: test.bool (FALSE)
2, FALSE, -1, -2.718, 'a'

# Integer lookups, including multiple matches and missing values.
[plan]> SELECT * FROM test WHERE "int" = -1
---
IndexLookup: test.int (-1)
2, FALSE, -1, -2.718, 'a'

[plan]> SELECT * FROM test WHERE "int" = 0
---
IndexLookup: test.int (0)
1, TRUE, 0, 3.14, 'abc'

[plan]> SELECT * FROM test WHERE "int" = 1
---
IndexLookup: test.int (1)
3, TRUE, 1, 0.0, 'ABC'
4, NULL, 1, 0.0, '👍'

[plan]> SELECT * FROM test WHERE "int" = 7
---
IndexLookup: test.int (7)

# Floats. 0.0 and -0.0 should be equal. NAN should be unequal,
# but IS NAN should yield lookups.
[plan]> SELECT * FROM test WHERE "float" = -2.718
---
IndexLookup: test.float (-2.718)
2, FALSE, -1, -2.718, 'a'

[plan]> SELECT * FROM test WHERE "float" = -0.0
---
IndexLookup: test.float (-0.0)
3, TRUE, 1, 0.0, 'ABC'
4, NULL, 1, 0.0, '👍'

[plan]> SELECT * FROM test WHERE "float" = 0.0
---
IndexLookup: test.float (0.0)
3, TRUE, 1, 0.0, 'ABC'
4, NULL, 1, 0.0, '👍'

[plan]> SELECT * FROM test WHERE "float" = 3.14
---
IndexLookup: test.float (3.14)
1, TRUE, 0, 3.14, 'abc'

[plan]> SELECT * FROM test WHERE "float" = INFINITY
---
IndexLookup: test.float (inf)
5, NULL, NULL, inf, 'å'

[plan]> SELECT * FROM test WHERE "float" = NAN
---
Nothing

[plan]> SELECT * FROM test WHERE "float" = -NAN
---
Nothing

[plan]> SELECT * FROM test WHERE "float" IS NAN
---
IndexLookup: test.float (NaN)
6, NULL, NULL, NaN, ''

# Strings. Should be case-sensitive.
[plan]> SELECT * FROM test WHERE "string" = 'abc'
---
IndexLookup: test.string ('abc')
1, TRUE, 0, 3.14, 'abc'

[plan]> SELECT * FROM test WHERE "string" = 'a'
---
IndexLookup: test.string ('a')
2, FALSE, -1, -2.718, 'a'

[plan]> SELECT * FROM test WHERE "string" = 'å'
---
IndexLookup: test.string ('å')
5, NULL, NULL, inf, 'å'

[plan]> SELECT * FROM test WHERE "string" = '👍'
---
IndexLookup: test.string ('👍')
4, NULL, 1, 0.0, '👍'

[plan]> SELECT * FROM test WHERE "string" = ''
---
IndexLookup: test.string ('')
6, NULL, NULL, NaN, ''

# LIKE does not use an index.
[plan]> SELECT * FROM test WHERE "string" LIKE 'a%'
---
Scan: test (test.string LIKE 'a%')
1, TRUE, 0, 3.14, 'abc'
2, FALSE, -1, -2.718, 'a'

# IS NULL lookups should use an index. = NULL should give no matches.
[plan]> SELECT * FROM test WHERE "int" IS NULL
---
IndexLookup: test.int (NULL)
0, NULL, NULL, NULL, NULL
5, NULL, NULL, inf, 'å'
6, NULL, NULL, NaN, ''

[plan]> SELECT * FROM test WHERE "int" = NULL
---
Nothing

# Multiple lookups work and use the index.
[plan]> SELECT * FROM test WHERE "int" = -1 OR "int" = 0 OR "int" = 1 OR "int" = 7
---
IndexLookup: test.int (-1, 0, 1, 7)
1, TRUE, 0, 3.14, 'abc'
2, FALSE, -1, -2.718, 'a'
3, TRUE, 1, 0.0, 'ABC'
4, NULL, 1, 0.0, '👍'

# > or < predicates don't use an index.
[plan]> SELECT * FROM test WHERE "int" < 1
---
Scan: test (test.int < 1)
1, TRUE, 0, 3.14, 'abc'
2, FALSE, -1, -2.718, 'a'

[plan]> SELECT * FROM test WHERE "int" > -1
---
Scan: test (test.int > -1)
1, TRUE, 0, 3.14, 'abc'
3, TRUE, 1, 0.0, 'ABC'
4, NULL, 1, 0.0, '👍'


================================================
FILE: src/sql/testscripts/queries/where_primary_key
================================================
# Tests WHERE primary key lookups.

# Boolean lookups.
> CREATE TABLE "bool" (id BOOL PRIMARY KEY)
> INSERT INTO "bool" VALUES (TRUE), (FALSE)
---
ok

[plan]> SELECT * FROM "bool" WHERE id = TRUE
---
KeyLookup: bool (TRUE)
TRUE

[plan]> SELECT * FROM "bool" WHERE id = FALSE
---
KeyLookup: bool (FALSE)
FALSE

# Integer lookups, including a missing value.
> CREATE TABLE "int" (id INT PRIMARY KEY)
> INSERT INTO "int" VALUES (-1), (0), (1)
---
ok

[plan]> SELECT * FROM "int" WHERE id = -1
---
KeyLookup: int (-1)
-1

[plan]> SELECT * FROM "int" WHERE id = 0
---
KeyLookup: int (0)
0

[plan]> SELECT * FROM "int" WHERE id = 1
---
KeyLookup: int (1)
1

[plan]> SELECT * FROM "int" WHERE id = 7
---
KeyLookup: int (7)

# Floats. NAN matches fail (and aren't valid primary keys anyway).
# 0.0 and -0.0 should be considered equal.
> CREATE TABLE "float" (id FLOAT PRIMARY KEY)
> INSERT INTO "float" VALUES (-2.718), (-0.0), (3.14), (INFINITY)

[plan]> SELECT * FROM "float" WHERE id = -2.718
---
KeyLookup: float (-2.718)
-2.718

[plan]> SELECT * FROM "float" WHERE id = -0.0
---
KeyLookup: float (-0.0)
0.0

[plan]> SELECT * FROM "float" WHERE id = 0.0
---
KeyLookup: float (0.0)
0.0

[plan]> SELECT * FROM "float" WHERE id = 3.14
---
KeyLookup: float (3.14)
3.14

[plan]> SELECT * FROM "float" WHERE id = INFINITY
---
KeyLookup: float (inf)
inf

[plan]> SELECT * FROM "float" WHERE id = NAN
---
Nothing

[plan]> SELECT * FROM "float" WHERE id IS NAN
---
KeyLookup: float (NaN)

# Strings. Should be case-sensitive.
> CREATE TABLE "string" (id STRING PRIMARY KEY)
> INSERT INTO "string" VALUES (''), ('a'), ('å'), ('abc'), ('ABC'), ('👍')

[plan]> SELECT * FROM "string" WHERE id = ''
---
KeyLookup: string ('')
''

[plan]> SELECT * FROM "string" WHERE id = 'a'
---
KeyLookup: string ('a')
'a'

[plan]> SELECT * FROM "string" WHERE id = 'å'
---
KeyLookup: string ('å')
'å'

[plan]> SELECT * FROM "string" WHERE id = 'abc'
---
KeyLookup: string ('abc')
'abc'

[plan]> SELECT * FROM "string" WHERE id = '👍'
---
KeyLookup: string ('👍')
'👍'

# LIKE does not use an index.
[plan]> SELECT * FROM "string" WHERE id LIKE 'a%'
---
Scan: string (string.id LIKE 'a%')
'a'
'abc'

# NULL lookups should be legal but give no matches.
[plan]> SELECT * FROM "int" WHERE id = NULL
---
Nothing

[plan]> SELECT * FROM "int" WHERE id IS NULL
---
KeyLookup: int (NULL)

# Multiple lookups work.
[plan]> SELECT * FROM "int" WHERE id = -1 OR id = 0 OR id = 1 OR id = 7
---
KeyLookup: int (-1, 0, 1, 7)
-1
0
1

# > or < predicates don't use an index.
[plan]> SELECT * FROM "int" WHERE id < 1
---
Scan: int (int.id < 1)
-1
0

[plan]> SELECT * FROM "int" WHERE id > -1
---
Scan: int (int.id > -1)
0
1


================================================
FILE: src/sql/testscripts/schema/create_table
================================================
# Tests basic CREATE TABLE functionality.

# The result contains the table name. The table is written to storage. Also
# output the plan, which is trivial.
[plan,result,ops]> CREATE TABLE test (id INTEGER PRIMARY KEY)
---
CreateTable: test
set mvcc:NextVersion → 2 ["\x00" → "\x02"]
set mvcc:TxnActive(1) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x01" → ""]
set mvcc:TxnWrite(1, sql:Table(test)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xfftest\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Table(test), 1) → CREATE TABLE test ( id INTEGER PRIMARY KEY ) ["\x04\x00\xfftest\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x10\x04test\x00\x01\x02id\x01\x00\x00\x01\x00\x00"]
delete mvcc:TxnWrite(1, sql:Table(test)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xfftest\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnActive(1) ["\x01\x00\x00\x00\x00\x00\x00\x00\x01"]
CreateTable { name: "test" }

dump
---
mvcc:NextVersion → 2 ["\x00" → "\x02"]
mvcc:Version(sql:Table(test), 1) → CREATE TABLE test ( id INTEGER PRIMARY KEY ) ["\x04\x00\xfftest\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x10\x04test\x00\x01\x02id\x01\x00\x00\x01\x00\x00"]

# Errors if table already exists.
!> CREATE TABLE test (id INTEGER PRIMARY KEY)
---
Error: invalid input: table test already exists

# No table name or columns errors.
!> CREATE TABLE
!> CREATE TABLE name
!> CREATE TABLE name ()
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input
Error: invalid input: expected identifier, got )

# Missing table or column names error.
!> CREATE TABLE (id INTEGER PRIMARY KEY)
!> CREATE TABLE name (INTEGER PRIMARY KEY)
---
Error: invalid input: expected identifier, got (
Error: invalid input: expected identifier, got INTEGER

# Unterminated identifier errors.
!> CREATE TABLE "name (id INTEGER PRIMARY KEY)
---
Error: invalid input: unexpected end of quoted identifier


================================================
FILE: src/sql/testscripts/schema/create_table_datatypes
================================================
# Tests CREATE TABLE datatypes.

# Create columns with all datatypes.
> CREATE TABLE datatypes ( \
    id INTEGER PRIMARY KEY, \
    "bool" BOOL, \
    "boolean" BOOLEAN, \
    "double" DOUBLE, \
    "float" FLOAT, \
    "int" INT, \
    "integer" INTEGER, \
    "string" STRING, \
    "text" TEXT, \
    "varchar" VARCHAR \
)
schema
---
CREATE TABLE datatypes (
  id INTEGER PRIMARY KEY,
  "bool" BOOLEAN DEFAULT NULL,
  "boolean" BOOLEAN DEFAULT NULL,
  "double" FLOAT DEFAULT NULL,
  "float" FLOAT DEFAULT NULL,
  "int" INTEGER DEFAULT NULL,
  "integer" INTEGER DEFAULT NULL,
  "string" STRING DEFAULT NULL,
  "text" STRING DEFAULT NULL,
  "varchar" STRING DEFAULT NULL
)

# Missing or unknown datatype errors.
!> CREATE TABLE test (id INTEGER PRIMARY KEY, value)
!> CREATE TABLE test (id INTEGER PRIMARY KEY, value FOO)
!> CREATE TABLE test (id INTEGER PRIMARY KEY, value INDEX)
---
Error: invalid input: unexpected token )
Error: invalid input: unexpected token foo
Error: invalid input: unexpected token INDEX


================================================
FILE: src/sql/testscripts/schema/create_table_default
================================================
# Tests column defaults.

# All datatypes.
> CREATE TABLE datatypes ( \
    id INT PRIMARY KEY, \
    "bool" BOOLEAN DEFAULT true, \
    "float" FLOAT DEFAULT 3.14, \
    "int" INTEGER DEFAULT 7, \
    "string" STRING DEFAULT 'foo' \
)
schema datatypes
---
CREATE TABLE datatypes (
  id INTEGER PRIMARY KEY,
  "bool" BOOLEAN DEFAULT TRUE,
  "float" FLOAT DEFAULT 3.14,
  "int" INTEGER DEFAULT 7,
  "string" STRING DEFAULT 'foo'
)

# Default datatypes must match column. This includes float/integer types.
!> CREATE TABLE name (id INT PRIMARY KEY, value STRING DEFAULT 7)
!> CREATE TABLE name (id INT PRIMARY KEY, value INTEGER DEFAULT 3.14)
!> CREATE TABLE name (id INT PRIMARY KEY, value FLOAT DEFAULT 7)
---
Error: invalid input: invalid default type INTEGER for STRING column value
Error: invalid input: invalid default type FLOAT for INTEGER column value
Error: invalid input: invalid default type INTEGER for FLOAT column value

# Default values can be expressions.
> CREATE TABLE expr (id INT PRIMARY KEY, value INT DEFAULT 7 + 3 * 2)
schema expr
---
CREATE TABLE expr (
  id INTEGER PRIMARY KEY,
  value INTEGER DEFAULT 13
)

# NULL is a valid default for a nullable column (and is the implicit default).
> CREATE TABLE "nullable" (id INT PRIMARY KEY, value STRING DEFAULT NULL, implicit STRING)
schema nullable
---
CREATE TABLE nullable (
  id INTEGER PRIMARY KEY,
  value STRING DEFAULT NULL,
  implicit STRING DEFAULT NULL
)

# A NULL default errors for a non-nullable column, including primary keys.
!> CREATE TABLE name (id INT PRIMARY KEY DEFAULT NULL)
!> CREATE TABLE name (id INT PRIMARY KEY, value STRING NOT NULL DEFAULT NULL)
---
Error: invalid input: invalid NULL default for non-nullable column id
Error: invalid input: invalid NULL default for non-nullable column value


================================================
FILE: src/sql/testscripts/schema/create_table_index
================================================
# Creating a table with an index only results in a single schema entry (no
# separate index).
[ops]> CREATE TABLE indexed (id INTEGER PRIMARY KEY, "index" INTEGER INDEX)
---
set mvcc:NextVersion → 2 ["\x00" → "\x02"]
set mvcc:TxnActive(1) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x01" → ""]
set mvcc:TxnWrite(1, sql:Table(indexed)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xffindexed\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Table(indexed), 1) → CREATE TABLE indexed ( id INTEGER PRIMARY KEY, "index" INTEGER DEFAULT NULL INDEX ) ["\x04\x00\xffindexed\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01 \x07indexed\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05index\x01\x01\x01\x00\x00\x01\x00"]
delete mvcc:TxnWrite(1, sql:Table(indexed)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xffindexed\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnActive(1) ["\x01\x00\x00\x00\x00\x00\x00\x00\x01"]

schema
---
CREATE TABLE indexed (
  id INTEGER PRIMARY KEY,
  "index" INTEGER DEFAULT NULL INDEX
)

# Explicit indexes can be given for primary keys, foreign keys,
# and unique columns.
> CREATE TABLE explicit ( \
    id INTEGER PRIMARY KEY INDEX, \
    "unique" INTEGER UNIQUE INDEX, \
    "reference" INTEGER REFERENCES indexed INDEX \
)
schema explicit
---
CREATE TABLE explicit (
  id INTEGER PRIMARY KEY,
  "unique" INTEGER DEFAULT NULL UNIQUE INDEX,
  reference INTEGER DEFAULT NULL INDEX REFERENCES indexed
)


================================================
FILE: src/sql/testscripts/schema/create_table_names
================================================
# Tests CREATE TABLE table and column name validation.

# A couple of valid names.
> CREATE TABLE a_123 (a_123 INTEGER PRIMARY KEY)
> CREATE TABLE 表 (身元 INTEGER PRIMARY KEY, 名前 STRING)
schema
---
CREATE TABLE a_123 (
  a_123 INTEGER PRIMARY KEY
)
CREATE TABLE 表 (
  身元 INTEGER PRIMARY KEY,
  名前 STRING DEFAULT NULL
)

# Mixed case is valid, but interpreted as lower case. Quoted identifiers retain
# their case.
> CREATE TABLE mIxEd_cAsE (ÄÅÆ STRING PRIMARY KEY)
> CREATE TABLE "mIxEd_cAsE" ("ÄÅÆ" STRING PRIMARY KEY)
schema mixed_case
schema mIxEd_cAsE
---
CREATE TABLE mixed_case (
  äåæ STRING PRIMARY KEY
)
CREATE TABLE mIxEd_cAsE (
  ÄÅÆ STRING PRIMARY KEY
)

# Unquoted _, number, keyword, and emoji errors.
!> CREATE TABLE _name (id INTEGER PRIMARY KEY)
!> CREATE TABLE 123 (1 INTEGER PRIMARY KEY)
!> CREATE TABLE table (primary INTEGER PRIMARY KEY)
!> CREATE TABLE 👋 (🆔 INTEGER PRIMARY KEY)
---
Error: invalid input: unexpected character _
Error: invalid input: expected identifier, got 123
Error: invalid input: expected identifier, got TABLE
Error: invalid input: unexpected character 👋

# Double quotes allow them.
> CREATE TABLE "_name" (id INTEGER PRIMARY KEY)
> CREATE TABLE "123" ("1" INTEGER PRIMARY KEY)
> CREATE TABLE "table" ("primary" INTEGER PRIMARY KEY)
> CREATE TABLE "👋" ("🆔" INTEGER PRIMARY KEY)
schema _name 123 table "👋"
---
CREATE TABLE "_name" (
  id INTEGER PRIMARY KEY
)
CREATE TABLE "123" (
  "1" INTEGER PRIMARY KEY
)
CREATE TABLE "table" (
  "primary" INTEGER PRIMARY KEY
)
CREATE TABLE "👋" (
  "🆔" INTEGER PRIMARY KEY
)

# "" escapes " in identifiers.
> CREATE TABLE "name with ""quotes""" (id INTEGER PRIMARY KEY);
schema 'name with "quotes"'
---
CREATE TABLE "name with ""quotes""" (
  id INTEGER PRIMARY KEY
)

# ' are for string literals, not identifiers.
!> CREATE TABLE 'name' (id INTEGER PRIMARY KEY)
---
Error: invalid input: expected identifier, got name


================================================
FILE: src/sql/testscripts/schema/create_table_null
================================================
# Tests column nullability.

# All datatypes can be nullable. Their default value is NULL.
> CREATE TABLE datatypes ( \
    id INTEGER PRIMARY KEY, \
    "bool" BOOLEAN NULL, \
    "float" FLOAT NULL, \
    "int" INTEGER NULL, \
    "string" STRING NULL \
)
schema datatypes
---
CREATE TABLE datatypes (
  id INTEGER PRIMARY KEY,
  "bool" BOOLEAN DEFAULT NULL,
  "float" FLOAT DEFAULT NULL,
  "int" INTEGER DEFAULT NULL,
  "string" STRING DEFAULT NULL
)

# Column can be made explicitly non-nullable.
> CREATE TABLE non_null (id INTEGER PRIMARY KEY, value STRING NOT NULL)
schema non_null
---
CREATE TABLE non_null (
  id INTEGER PRIMARY KEY,
  value STRING NOT NULL
)

# Column can't be both nullable and non-nullable.
!> CREATE TABLE test (id INTEGER PRIMARY KEY, value STRING NULL NOT NULL)
---
Error: invalid input: nullability already set for column value


================================================
FILE: src/sql/testscripts/schema/create_table_primary_key
================================================
# Tests primary keys.

# There must be exactly one primary key.
!> CREATE TABLE "primary" (id INTEGER)
!> CREATE TABLE "primary" (id INTEGER PRIMARY KEY, name STRING PRIMARY KEY)
> CREATE TABLE "primary" (id INTEGER PRIMARY KEY)
schema primary
---
Error: invalid input: no primary key for table primary
Error: invalid input: multiple primary keys for table primary
CREATE TABLE "primary" (
  id INTEGER PRIMARY KEY
)

# The primary key can't be nullable.
!> CREATE TABLE "null" (id INTEGER PRIMARY KEY NULL)
---
Error: invalid input: primary key id cannot be nullable

# It can have a default value though.
> CREATE TABLE "default" (id INTEGER PRIMARY KEY DEFAULT 1)
---
ok

# Primary keys can also take all datatypes.
> CREATE TABLE "bool" (id BOOL PRIMARY KEY)
> CREATE TABLE "float" (id FLOAT PRIMARY KEY)
> CREATE TABLE "string" (id STRING PRIMARY KEY)
schema bool float string
---
CREATE TABLE "bool" (
  id BOOLEAN PRIMARY KEY
)
CREATE TABLE "float" (
  id FLOAT PRIMARY KEY
)
CREATE TABLE "string" (
  id STRING PRIMARY KEY
)


================================================
FILE: src/sql/testscripts/schema/create_table_reference
================================================
# Tests foreign key references during CREATE TABLE.

# Create two reference tables, with int/string primary keys.
> CREATE TABLE "ref" (id INT PRIMARY KEY, value STRING NOT NULL)
> INSERT INTO "ref" VALUES (1, 'a'), (2, 'b')
---
ok

> CREATE TABLE sref (id STRING PRIMARY KEY, value INTEGER NOT NULL)
> INSERT INTO sref VALUES ('a', 1), ('b', 2)
---
ok

# Creating a table with references works. The reference columns get implicit
# indexes, but only a single schema entity.
[ops]> CREATE TABLE name (id INT PRIMARY KEY, ref_id INT REFERENCES "ref", sref_id STRING REFERENCES sref)
---
set mvcc:NextVersion → 6 ["\x00" → "\x06"]
set mvcc:TxnActive(5) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x05" → ""]
set mvcc:TxnWrite(5, sql:Table(name)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x00\xffname\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Table(name), 5) → CREATE TABLE name ( id INTEGER PRIMARY KEY, ref_id INTEGER DEFAULT NULL INDEX REFERENCES ref, sref_id STRING DEFAULT NULL INDEX REFERENCES sref ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x016\x04name\x00\x03\x02id\x01\x00\x00\x01\x00\x00\x06ref_id\x01\x01\x01\x00\x00\x01\x01\x03ref\x07sref_id\x03\x01\x01\x00\x00\x01\x01\x04sref"]
delete mvcc:TxnWrite(5, sql:Table(name)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x00\xffname\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnActive(5) ["\x01\x00\x00\x00\x00\x00\x00\x00\x05"]

schema name
---
CREATE TABLE name (
  id INTEGER PRIMARY KEY,
  ref_id INTEGER DEFAULT NULL INDEX REFERENCES ref,
  sref_id STRING DEFAULT NULL INDEX REFERENCES sref
)

dump
---
mvcc:NextVersion → 6 ["\x00" → "\x06"]
mvcc:Version(sql:Table(name), 5) → CREATE TABLE name ( id INTEGER PRIMARY KEY, ref_id INTEGER DEFAULT NULL INDEX REFERENCES ref, sref_id STRING DEFAULT NULL INDEX REFERENCES sref ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x016\x04name\x00\x03\x02id\x01\x00\x00\x01\x00\x00\x06ref_id\x01\x01\x01\x00\x00\x01\x01\x03ref\x07sref_id\x03\x01\x01\x00\x00\x01\x01\x04sref"]
mvcc:Version(sql:Table(ref), 1) → CREATE TABLE ref ( id INTEGER PRIMARY KEY, value STRING NOT NULL ) ["\x04\x00\xffref\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1b\x03ref\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x00\x00\x00\x00\x00"]
mvcc:Version(sql:Table(sref), 3) → CREATE TABLE sref ( id STRING PRIMARY KEY, value INTEGER NOT NULL ) ["\x04\x00\xffsref\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x1c\x04sref\x00\x02\x02id\x03\x00\x00\x01\x00\x00\x05value\x01\x00\x00\x00\x00\x00"]
mvcc:Version(sql:Row(ref, 1), 2) → 1,'a' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(ref, 2), 2) → 2,'b' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(sref, 'a'), 4) → 'a',1 ["\x04\x02sref\x00\xff\x00\xff\x04a\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x06\x02\x04\x01a\x02\x02"]
mvcc:Version(sql:Row(sref, 'b'), 4) → 'b',2 ["\x04\x02sref\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x06\x02\x04\x01b\x02\x04"]

# Missing reference table errors.
!> CREATE TABLE test (id INT PRIMARY KEY, "ref" INT REFERENCES missing)
---
Error: invalid input: unknown table missing referenced by column ref

# Reference type conflicts error.
!> CREATE TABLE test (id INT PRIMARY KEY, ref_id FLOAT REFERENCES "ref")
!> CREATE TABLE test (id INT PRIMARY KEY, sref_id INT REFERENCES sref)
---
Error: invalid input: can't reference INTEGER primary key of ref from FLOAT column ref_id
Error: invalid input: can't reference STRING primary key of sref from INTEGER column sref_id

# Self-references work.
> CREATE TABLE self (id INT PRIMARY KEY, self_id INT REFERENCES self)
schema self
---
CREATE TABLE self (
  id INTEGER PRIMARY KEY,
  self_id INTEGER DEFAULT NULL INDEX REFERENCES self
)


================================================
FILE: src/sql/testscripts/schema/create_table_transaction
================================================
# Tests that CREATE TABLE is transactional.

> BEGIN
[ops]> CREATE TABLE name (id INT PRIMARY KEY, value STRING)
---
set mvcc:TxnWrite(1, sql:Table(name)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xffname\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]

schema name
---
CREATE TABLE name (
  id INTEGER PRIMARY KEY,
  value STRING DEFAULT NULL
)

dump
---
mvcc:NextVersion → 2 ["\x00" → "\x02"]
mvcc:TxnActive(1) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x01" → ""]
mvcc:TxnWrite(1, sql:Table(name)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xffname\x00\xff\x00\xff\x00\x00" → ""]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]

# Rolling it back undoes it.
[ops]> ROLLBACK
---
delete mvcc:Version(sql:Table(name), 1) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"]
delete mvcc:TxnWrite(1, sql:Table(name)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xffname\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnActive(1) ["\x01\x00\x00\x00\x00\x00\x00\x00\x01"]

dump
---
mvcc:NextVersion → 2 ["\x00" → "\x02"]

# Committing a table also works.
> BEGIN
> CREATE TABLE name (id INT PRIMARY KEY, value STRING)
[ops]> COMMIT
---
delete mvcc:TxnWrite(2, sql:Table(name)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x00\xffname\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnActive(2) ["\x01\x00\x00\x00\x00\x00\x00\x00\x02"]

dump
---
mvcc:NextVersion → 3 ["\x00" → "\x03"]
mvcc:Version(sql:Table(name), 2) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]


================================================
FILE: src/sql/testscripts/schema/create_table_unique
================================================
# Creating a table with a unique index only results in a single schema entry (no
# separate index).
[ops]> CREATE TABLE indexed (id INTEGER PRIMARY KEY, "index" INTEGER UNIQUE)
---
set mvcc:NextVersion → 2 ["\x00" → "\x02"]
set mvcc:TxnActive(1) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x01" → ""]
set mvcc:TxnWrite(1, sql:Table(indexed)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xffindexed\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Table(indexed), 1) → CREATE TABLE indexed ( id INTEGER PRIMARY KEY, "index" INTEGER DEFAULT NULL UNIQUE INDEX ) ["\x04\x00\xffindexed\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01 \x07indexed\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05index\x01\x01\x01\x00\x01\x01\x00"]
delete mvcc:TxnWrite(1, sql:Table(indexed)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xffindexed\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnActive(1) ["\x01\x00\x00\x00\x00\x00\x00\x00\x01"]

# The column gets an implicit secondary index marker.
schema
---
CREATE TABLE indexed (
  id INTEGER PRIMARY KEY,
  "index" INTEGER DEFAULT NULL UNIQUE INDEX
)

# Unique indexes work for primary key, foreign key, nullable, non-nullable, and
# default columns.
> CREATE TABLE "unique" ( \
  id INTEGER PRIMARY KEY UNIQUE, \
  ref INTEGER REFERENCES indexed UNIQUE, \
  nullable INTEGER NULL UNIQUE, \
  non_nullable INTEGER NOT NULL UNIQUE, \
  "default" INTEGER DEFAULT 7 UNIQUE \
)
schema unique
---
CREATE TABLE "unique" (
  id INTEGER PRIMARY KEY,
  ref INTEGER DEFAULT NULL UNIQUE INDEX REFERENCES indexed,
  nullable INTEGER DEFAULT NULL UNIQUE INDEX,
  non_nullable INTEGER NOT NULL UNIQUE INDEX,
  "default" INTEGER DEFAULT 7 UNIQUE INDEX
)


================================================
FILE: src/sql/testscripts/schema/drop_table
================================================
# Basic DROP TABLE tests.

> CREATE TABLE name (id INT PRIMARY KEY, value STRING NOT NULL)
> INSERT INTO name VALUES (1, 'a'), (2, 'b')
---
ok

# Dropping a simple table works, and removes the schema entry and rows.
# Also output the trivial plan and statement result.
[plan,result,ops]> DROP TABLE name
---
DropTable: name
set mvcc:NextVersion → 4 ["\x00" → "\x04"]
set mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
set mvcc:TxnWrite(3, sql:Table(name)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\xffname\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Table(name), 3) → None ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
delete mvcc:TxnWrite(3, sql:Table(name)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\xffname\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(3, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(3, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(3) ["\x01\x00\x00\x00\x00\x00\x00\x00\x03"]
DropTable { name: "name", existed: true }

schema
---
ok

dump
---
mvcc:NextVersion → 4 ["\x00" → "\x04"]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING NOT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1c\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x00\x00\x00\x00\x00"]
mvcc:Version(sql:Table(name), 3) → None ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 1), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
mvcc:Version(sql:Row(name, 2), 2) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(name, 2), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]

# Dropping a missing table errors, but not if IF EXISTS is given.
!> DROP TABLE name
---
Error: invalid input: table name does not exist

[result,ops]> DROP TABLE IF EXISTS name
---
set mvcc:NextVersion → 6 ["\x00" → "\x06"]
set mvcc:TxnActive(5) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x05" → ""]
delete mvcc:TxnActive(5) ["\x01\x00\x00\x00\x00\x00\x00\x00\x05"]
DropTable { name: "name", existed: false }

# No table or multiple tables errors.
!> DROP TABLE
!> DROP TABLE a, b, c
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected token ,


================================================
FILE: src/sql/testscripts/schema/drop_table_index
================================================
# Tests that DROP TABLE cleans up secondary indexes of all kinds.

> CREATE TABLE "ref" (id INT PRIMARY KEY, value STRING NOT NULL)
> INSERT INTO "ref" VALUES (1, 'a'), (2, 'b')
---
ok

> CREATE TABLE name ( \
  id INT PRIMARY KEY, \
  "index" STRING, \
  "unique" INT UNIQUE NOT NULL, \
  ref_id INT REFERENCES "ref" \
)
> INSERT INTO name VALUES (1, 'foo', 1, 1)
> INSERT INTO name VALUES (2, 'bar', 2, 2)
> INSERT INTO name VALUES (3, 'foo', 3, NULL)
> INSERT INTO name VALUES (4, NULL, 4, 2)
---
ok

dump
---
mvcc:NextVersion → 8 ["\x00" → "\x08"]
mvcc:Version(sql:Table(name), 3) → CREATE TABLE name ( id INTEGER PRIMARY KEY, "index" STRING DEFAULT NULL, "unique" INTEGER NOT NULL UNIQUE INDEX, ref_id INTEGER DEFAULT NULL INDEX REFERENCES ref ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01<\x04name\x00\x04\x02id\x01\x00\x00\x01\x00\x00\x05index\x03\x01\x01\x00\x00\x00\x00\x06unique\x01\x00\x00\x01\x01\x00\x06ref_id\x01\x01\x01\x00\x00\x01\x01\x03ref"]
mvcc:Version(sql:Table(ref), 1) → CREATE TABLE ref ( id INTEGER PRIMARY KEY, value STRING NOT NULL ) ["\x04\x00\xffref\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1b\x03ref\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x00\x00\x00\x00\x00"]
mvcc:Version(sql:Index(name.ref_id, NULL), 6) → 3 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.ref_id, 1), 4) → 1 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(name.ref_id, 2), 5) → 2 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(name.ref_id, 2), 7) → 2,4 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x05\x02\x02\x04\x02\x08"]
mvcc:Version(sql:Index(name.unique, 1), 4) → 1 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(name.unique, 2), 5) → 2 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(name.unique, 3), 6) → 3 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.unique, 4), 7) → 4 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x08"]
mvcc:Version(sql:Row(name, 1), 4) → 1,'foo',1,1 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x0c\x04\x02\x02\x04\x03foo\x02\x02\x02\x02"]
mvcc:Version(sql:Row(name, 2), 5) → 2,'bar',2,2 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x0c\x04\x02\x04\x04\x03bar\x02\x04\x02\x04"]
mvcc:Version(sql:Row(name, 3), 6) → 3,'foo',3,NULL ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x0b\x04\x02\x06\x04\x03foo\x02\x06\x00"]
mvcc:Version(sql:Row(name, 4), 7) → 4,NULL,4,2 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x08\x04\x02\x08\x00\x02\x08\x02\x04"]
mvcc:Version(sql:Row(ref, 1), 2) → 1,'a' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(ref, 2), 2) → 2,'b' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]

# Dropping the table deletes all index entries.
[ops]> DROP TABLE name
> DROP TABLE ref
---
set mvcc:NextVersion → 9 ["\x00" → "\t"]
set mvcc:TxnActive(8) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x08" → ""]
set mvcc:TxnWrite(8, sql:Table(name)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x00\xffname\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Table(name), 8) → None ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 8) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 8) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Row(name, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 3), 8) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Row(name, 4)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 4), 8) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(name.unique, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(name.unique, 1), 8) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(name.unique, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Index(name.unique, 2), 8) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(name.unique, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Index(name.unique, 3), 8) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(name.unique, 4)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00" → ""]
set mvcc:Version(sql:Index(name.unique, 4), 8) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(name.ref_id, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.ref_id, NULL), 8) → None ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(name.ref_id, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(name.ref_id, 1), 8) → None ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(name.ref_id, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Index(name.ref_id, 2), 8) → None ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
delete mvcc:TxnWrite(8, sql:Table(name)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x00\xffname\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(name.ref_id, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(name.ref_id, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(name.ref_id, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(name.unique, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(name.unique, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(name.unique, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(name.unique, 4)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(name, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(name, 4)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00"]
delete mvcc:TxnActive(8) ["\x01\x00\x00\x00\x00\x00\x00\x00\x08"]

dump
---
mvcc:NextVersion → 10 ["\x00" → "\n"]
mvcc:Version(sql:Table(name), 3) → CREATE TABLE name ( id INTEGER PRIMARY KEY, "index" STRING DEFAULT NULL, "unique" INTEGER NOT NULL UNIQUE INDEX, ref_id INTEGER DEFAULT NULL INDEX REFERENCES ref ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01<\x04name\x00\x04\x02id\x01\x00\x00\x01\x00\x00\x05index\x03\x01\x01\x00\x00\x00\x00\x06unique\x01\x00\x00\x01\x01\x00\x06ref_id\x01\x01\x01\x00\x00\x01\x01\x03ref"]
mvcc:Version(sql:Table(name), 8) → None ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Table(ref), 1) → CREATE TABLE ref ( id INTEGER PRIMARY KEY, value STRING NOT NULL ) ["\x04\x00\xffref\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1b\x03ref\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x00\x00\x00\x00\x00"]
mvcc:Version(sql:Table(ref), 9) → None ["\x04\x00\xffref\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]
mvcc:Version(sql:Index(name.ref_id, NULL), 6) → 3 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.ref_id, NULL), 8) → None ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Index(name.ref_id, 1), 4) → 1 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(name.ref_id, 1), 8) → None ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Index(name.ref_id, 2), 5) → 2 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(name.ref_id, 2), 7) → 2,4 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x05\x02\x02\x04\x02\x08"]
mvcc:Version(sql:Index(name.ref_id, 2), 8) → None ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Index(name.unique, 1), 4) → 1 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(name.unique, 1), 8) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Index(name.unique, 2), 5) → 2 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(name.unique, 2), 8) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Index(name.unique, 3), 6) → 3 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.unique, 3), 8) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Index(name.unique, 4), 7) → 4 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x08"]
mvcc:Version(sql:Index(name.unique, 4), 8) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Row(name, 1), 4) → 1,'foo',1,1 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x0c\x04\x02\x02\x04\x03foo\x02\x02\x02\x02"]
mvcc:Version(sql:Row(name, 1), 8) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Row(name, 2), 5) → 2,'bar',2,2 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x0c\x04\x02\x04\x04\x03bar\x02\x04\x02\x04"]
mvcc:Version(sql:Row(name, 2), 8) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Row(name, 3), 6) → 3,'foo',3,NULL ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x0b\x04\x02\x06\x04\x03foo\x02\x06\x00"]
mvcc:Version(sql:Row(name, 3), 8) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Row(name, 4), 7) → 4,NULL,4,2 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x08\x04\x02\x08\x00\x02\x08\x02\x04"]
mvcc:Version(sql:Row(name, 4), 8) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
mvcc:Version(sql:Row(ref, 1), 2) → 1,'a' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(ref, 1), 9) → None ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]
mvcc:Version(sql:Row(ref, 2), 2) → 2,'b' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(ref, 2), 9) → None ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]


================================================
FILE: src/sql/testscripts/schema/drop_table_ref
================================================
# Tests DROP TABLE with references.

# Create a reference table and foreign key table.
> CREATE TABLE "ref" (id INT PRIMARY KEY)
> CREATE TABLE name (id INT PRIMARY KEY, ref_if INT REFERENCES "ref")
---
ok

# Dropping a table with a foreign key reference to it errors.
!> DROP TABLE "ref"
---
Error: invalid input: table ref is referenced from name.ref_if

# But it works if the source table is dropped first.
> DROP TABLE name
> DROP TABLE "ref"
---
ok

# Dropping a table with a self reference also works.
> CREATE TABLE self (id INT PRIMARY KEY, self_id INT REFERENCES self)
---
ok

> DROP TABLE self
---
ok

schema
---
ok


================================================
FILE: src/sql/testscripts/schema/drop_table_transaction
================================================
# Tests that DROP TABLE is transactional.

> CREATE TABLE name (id INT PRIMARY KEY, value STRING)
> INSERT INTO name VALUES (1, 'a'), (2, 'b')
---
ok

dump
---
mvcc:NextVersion → 3 ["\x00" → "\x03"]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 2), 2) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]

# Drop the table in a transaction.
> BEGIN
[ops]> DROP TABLE name
---
set mvcc:TxnWrite(3, sql:Table(name)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\xffname\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Table(name), 3) → None ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]

schema
---
ok

dump
---
mvcc:NextVersion → 4 ["\x00" → "\x04"]
mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
mvcc:TxnWrite(3, sql:Table(name)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\xffname\x00\xff\x00\xff\x00\x00" → ""]
mvcc:TxnWrite(3, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
mvcc:TxnWrite(3, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Table(name), 3) → None ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 1), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
mvcc:Version(sql:Row(name, 2), 2) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(name, 2), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]

# Rolling it back undoes it.
[ops]> ROLLBACK
---
delete mvcc:Version(sql:Table(name), 3) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"]
delete mvcc:TxnWrite(3, sql:Table(name)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\xffname\x00\xff\x00\xff\x00\x00"]
delete mvcc:Version(sql:Row(name, 1), 3) ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"]
delete mvcc:TxnWrite(3, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:Version(sql:Row(name, 2), 3) ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"]
delete mvcc:TxnWrite(3, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(3) ["\x01\x00\x00\x00\x00\x00\x00\x00\x03"]

dump
---
mvcc:NextVersion → 4 ["\x00" → "\x04"]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 2), 2) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]

# Committing the drop also works.
> BEGIN
> DROP TABLE name
[ops]> COMMIT
---
delete mvcc:TxnWrite(4, sql:Table(name)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\xffname\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(4, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(4) ["\x01\x00\x00\x00\x00\x00\x00\x00\x04"]

dump
---
mvcc:NextVersion → 5 ["\x00" → "\x05"]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Table(name), 4) → None ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x00"]
mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 1), 4) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x00"]
mvcc:Version(sql:Row(name, 2), 2) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(name, 2), 4) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x00"]


================================================
FILE: src/sql/testscripts/transactions/anomaly_dirty_read
================================================
# A dirty read is when c2 can read an uncommitted value set by c1. Snapshot
# isolation prevents this.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
---
ok

c1:> BEGIN
c1:> INSERT INTO test VALUES (1, 'a')
---
ok

c2:> BEGIN
c2:> SELECT * FROM test WHERE id = 1
---
ok


================================================
FILE: src/sql/testscripts/transactions/anomaly_dirty_write
================================================
# A dirty write is when c2 overwrites an uncommitted value written by c1.
# Snapshot isolation prevents this.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
---
ok

c1:> BEGIN
c1:> INSERT INTO test VALUES (1, 'a')
---
ok

c2:> BEGIN
c2:!> INSERT INTO test VALUES (1, 'a')
---
c2: Error: serialization failure, retry transaction


================================================
FILE: src/sql/testscripts/transactions/anomaly_fuzzy_read
================================================
# A fuzzy (or unrepeatable) read is when c2 sees a value change after c1
# updates it. Snapshot isolation prevents this.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a')
---
ok

c1:> BEGIN
c2:> BEGIN
---
ok

c2:> SELECT * FROM test WHERE id = 1
---
c2: 1, 'a'

c1:> UPDATE test SET value = 'b' WHERE id = 1
c1:> COMMIT
c1:> SELECT * FROM test
---
c1: 1, 'b'

c2:> SELECT * FROM test WHERE id = 1
---
c2: 1, 'a'


================================================
FILE: src/sql/testscripts/transactions/anomaly_lost_update
================================================
# A lost update is when c1 and c2 both read a value and update it, where
# c2's update replaces c1's. Snapshot isolation prevents this.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
---
ok


c1:> BEGIN
c1:> SELECT * FROM test WHERE id = 1
---
ok

c2:> BEGIN
c2:> SELECT * FROM test WHERE id = 1
---
ok

c1:> INSERT INTO test VALUES (1, 'a')
c1:> COMMIT
---
ok

c2:!> INSERT INTO test VALUES (1, 'a')
---
c2: Error: serialization failure, retry transaction


================================================
FILE: src/sql/testscripts/transactions/anomaly_phantom_read
================================================
# A phantom read is when c1 reads entries matching some predicate, but a
# modification by c2 changes which entries match the predicate such that a later
# read by c1 would return a different set of entries. Snapshot isolation
# prevents this.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')
---
ok

c1:> BEGIN
c2:> BEGIN
---
ok

c1:> SELECT * FROM test WHERE id > 1
---
c1: 2, 'b'
c1: 3, 'c'

c2:> DELETE FROM test WHERE id = 2
c2:> INSERT INTO test VALUES (4, 'd')
c2:> COMMIT
---
ok

c1:> SELECT * FROM test WHERE id > 1
---
c1: 2, 'b'
c1: 3, 'c'


================================================
FILE: src/sql/testscripts/transactions/anomaly_read_skew
================================================
# Read skew is when c1 reads a and b, but c2 modifies b in between the
# reads. Snapshot isolation prevents this.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b')
---
ok

c1:> BEGIN
c2:> BEGIN
---
ok

c1:> SELECT * FROM test WHERE id = 1
---
c1: 1, 'a'

c2:> UPDATE test SET value = 'b' WHERE id = 1
c2:> UPDATE test SET value = 'a' WHERE id = 2
c2:> COMMIT
---
ok

c1:> SELECT * FROM test WHERE id = 2
---
c1: 2, 'b'


================================================
FILE: src/sql/testscripts/transactions/anomaly_write_skew
================================================
# Write skew is when c1 reads a and writes it to b while c2 reads b and writes
# it to a. Snapshot isolation does not prevent this, which is expected, so we
# assert the anomalous behavior. Fixing this would require implementing
# serializable snapshot isolation.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a'), (2, 'b')
---
ok

c1:> BEGIN
c2:> BEGIN
---
ok

c1:> SELECT * FROM test WHERE id = 1
c2:> SELECT * FROM test WHERE id = 2
---
c1: 1, 'a'
c2: 2, 'b'

c1:> UPDATE test SET value = 'a' WHERE id = 2
c2:> UPDATE test SET value = 'b' WHERE id = 1
---
ok

c1:> COMMIT
c2:> COMMIT
---
ok

> SELECT * FROM test
---
1, 'b'
2, 'a'


================================================
FILE: src/sql/testscripts/transactions/begin
================================================
# Tests BEGIN.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (0, '')
---
ok

# BEGIN starts a new transaction. It bumps NextVersion and writes a TxnActive
# record for itself.
c1:[result,ops]> BEGIN
---
c1: set mvcc:NextVersion → 4 ["\x00" → "\x04"]
c1: set mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
c1: Begin(TransactionState { version: 3, read_only: false, active: {} })

# Starting another transaction for c1 errors.
c1:!> BEGIN
c1:!> BEGIN READ ONLY
---
c1: Error: invalid input: already in a transaction
c1: Error: invalid input: already in a transaction

# Another client can begin a concurrent transaction, capturing c1's version in
# its active set. The active snapshot is persisted to storage.
c2:[result,ops]> BEGIN
---
c2: set mvcc:NextVersion → 5 ["\x00" → "\x05"]
c2: set mvcc:TxnActiveSnapshot(4) → {3} ["\x02\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03"]
c2: set mvcc:TxnActive(4) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x04" → ""]
c2: Begin(TransactionState { version: 4, read_only: false, active: {3} })

# A read-only transaction doesn't allocate a new version, and doesn't perform
# any storage engine writes. It does capture an active set though, and it can't
# perform any writes.
c3:[result,ops]> BEGIN READ ONLY
c3:!> INSERT INTO test VALUES (0, '')
c3:> ROLLBACK
---
c3: Begin(TransactionState { version: 5, read_only: true, active: {3, 4} })
c3: Error: invalid input: primary key 0 already exists

# c1 writes a value and commits.
c1:> INSERT INTO test VALUES (1, 'a')
c1:> COMMIT
---
ok

# A transaction as of version 1 doesn't see anything, since the
# table was created in this version.
c3:[result,ops]> BEGIN READ ONLY AS OF SYSTEM TIME 1
c3:!> SELECT * FROM test
c3:> ROLLBACK
---
c3: Begin(TransactionState { version: 1, read_only: true, active: {} })
c3: Error: invalid input: table test does not exist

# It sees the table at version 2, but no rows. The row is visible
# at version 3, but not c1's write which was committed at the end
# of version 3.
c3:[result,ops]> BEGIN READ ONLY AS OF SYSTEM TIME 2
c3:> SELECT * FROM test
c3:> ROLLBACK
---
c3: Begin(TransactionState { version: 2, read_only: true, active: {} })

c3:[result,ops]> BEGIN READ ONLY AS OF SYSTEM TIME 3
c3:> SELECT * FROM test
c3:> ROLLBACK
---
c3: Begin(TransactionState { version: 3, read_only: true, active: {} })
c3: 0, ''

# At version 4, we inherit c2's active set, which contains c1 and therefore
# hides it, so we still can't see c1's write.
c3:[result,ops]> BEGIN READ ONLY AS OF SYSTEM TIME 4
c3:> SELECT * FROM test
c3:> ROLLBACK
---
c3: Begin(TransactionState { version: 4, read_only: true, active: {3} })
c3: 0, ''


================================================
FILE: src/sql/testscripts/transactions/commit
================================================
# Tests COMMIT.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
---
ok

# A commit removes the TxnActive record and its TxnWrite records.
[ops,result]> BEGIN
[ops,result]> INSERT INTO test VALUES (1, 'a'), (2, 'b')
[ops,result]> COMMIT
---
set mvcc:NextVersion → 3 ["\x00" → "\x03"]
set mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
Begin(TransactionState { version: 2, read_only: false, active: {} })
set mvcc:TxnWrite(2, sql:Row(test, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(test, 1), 2) → 1,'a' ["\x04\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
set mvcc:TxnWrite(2, sql:Row(test, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(test, 2), 2) → 2,'b' ["\x04\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
Insert { count: 2 }
delete mvcc:TxnWrite(2, sql:Row(test, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(2, sql:Row(test, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(2) ["\x01\x00\x00\x00\x00\x00\x00\x00\x02"]
Commit { version: 2 }

# A later transaction can see its writes.
c1:> SELECT * FROM test
---
c1: 1, 'a'
c1: 2, 'b'

# If there are concurrent transactions, it does not remove the TxnActiveSnapshot.
c1:> BEGIN
---
ok

c2:[ops,result]> BEGIN
c2:[ops,result]> COMMIT
---
c2: set mvcc:NextVersion → 5 ["\x00" → "\x05"]
c2: set mvcc:TxnActiveSnapshot(4) → {3} ["\x02\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03"]
c2: set mvcc:TxnActive(4) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x04" → ""]
c2: Begin(TransactionState { version: 4, read_only: false, active: {3} })
c2: delete mvcc:TxnActive(4) ["\x01\x00\x00\x00\x00\x00\x00\x00\x04"]
c2: Commit { version: 4 }

# Commit errors when there's no open transaction.
!> COMMIT
---
Error: invalid input: not in a transaction


================================================
FILE: src/sql/testscripts/transactions/isolation
================================================
# Tests transaction isolation.
#
# Transactions are tested more thoroughly in the MVCC tests, this just does some
# basic SQL-level testing.
#
# Sets up a sequence of transactions that each perform a write, and checks
# what they can see.
#
# c1: past, committed before c4 began
# c2: past, commits after c4 began
# c3: past, uncommitted
# c4: test transaction
# c5: future, committed
# c6: future, uncommitted
# c7: future, AS OF version 4

# c1: past, committed before c4 began
c1:> BEGIN
c1:> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
c1:> INSERT INTO test VALUES (1, 'a')
c1:> COMMIT
---
ok

# c2: past, commits after c4 began
c2:> BEGIN
c2:> INSERT INTO test VALUES (2, 'b')
---
ok

# c3: past, uncommitted
c3:> BEGIN
c3:> INSERT INTO test VALUES (3, 'c')
---
ok

# c4: test transaction
c4:[result]> BEGIN
c4:> INSERT INTO test VALUES (4, 'd')
---
c4: Begin(TransactionState { version: 4, read_only: false, active: {2, 3} })

# Commit c2.
c2:> COMMIT
---
ok

# c5: future, committed
c5:> BEGIN
c5:> INSERT INTO test VALUES (5, 'e')
c5:> COMMIT
---
ok

# c6: future, uncommitted
c6:> BEGIN
c6:> INSERT INTO test VALUES (6, 'f')
---
ok

# When c4 scans, it should only see the write of c1 and itself.
c4:> SELECT * FROM test
---
c4: 1, 'a'
c4: 4, 'd'

# An AS OF transaction in version 4 should not see c4's uncommitted write.
c7:> BEGIN READ ONLY AS OF SYSTEM TIME 4
c7:> SELECT * FROM test
c7:> ROLLBACK
---
c7: 1, 'a'

# c4 can commit.
c4:> COMMIT
---
ok

# An implicit transaction should see c1, c2, c4, c5:
> SELECT * FROM test
---
1, 'a'
2, 'b'
4, 'd'
5, 'e'

# An AS OF transaction in version 4 should not see c4's write even after it
# has committed, such that it's consistent with the previous AS OF 4. The
# snapshot is taken out at the start of the version.
c7:> BEGIN READ ONLY AS OF SYSTEM TIME 4
c7:> SELECT * FROM test
c7:> ROLLBACK
---
c7: 1, 'a'


================================================
FILE: src/sql/testscripts/transactions/rollback
================================================
# Tests ROLLBACK.

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
---
ok

# A rollback removes the row, TxnActive record and its TxnWrite records.
[ops,result]> BEGIN
[ops,result]> INSERT INTO test VALUES (1, 'a'), (2, 'b')
[ops,result]> ROLLBACK
---
set mvcc:NextVersion → 3 ["\x00" → "\x03"]
set mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
Begin(TransactionState { version: 2, read_only: false, active: {} })
set mvcc:TxnWrite(2, sql:Row(test, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(test, 1), 2) → 1,'a' ["\x04\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
set mvcc:TxnWrite(2, sql:Row(test, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(test, 2), 2) → 2,'b' ["\x04\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
Insert { count: 2 }
delete mvcc:Version(sql:Row(test, 1), 2) ["\x04\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"]
delete mvcc:TxnWrite(2, sql:Row(test, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:Version(sql:Row(test, 2), 2) ["\x04\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"]
delete mvcc:TxnWrite(2, sql:Row(test, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02test\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(2) ["\x01\x00\x00\x00\x00\x00\x00\x00\x02"]
Rollback { version: 2 }

# A later transaction can't see its writes.
c1:> SELECT * FROM test
---
ok

# If there are concurrent transactions, it does not remove the
# TxnActiveSnapshot. This is needed for consistent AS OF queries.
c1:> BEGIN
---
ok

c2:[ops,result]> BEGIN
c2:[ops,result]> ROLLBACK
---
c2: set mvcc:NextVersion → 5 ["\x00" → "\x05"]
c2: set mvcc:TxnActiveSnapshot(4) → {3} ["\x02\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03"]
c2: set mvcc:TxnActive(4) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x04" → ""]
c2: Begin(TransactionState { version: 4, read_only: false, active: {3} })
c2: delete mvcc:TxnActive(4) ["\x01\x00\x00\x00\x00\x00\x00\x00\x04"]
c2: Rollback { version: 4 }

# Rollback errors when there's no open transaction.
!> ROLLBACK
---
Error: invalid input: not in a transaction


================================================
FILE: src/sql/testscripts/transactions/schema
================================================
# Tests that schema changes are transactional.

c1:> BEGIN
c1:[ops]> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
c1:> SELECT * FROM test
---
c1: set mvcc:TxnWrite(1, sql:Table(test)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xfftest\x00\xff\x00\xff\x00\x00" → ""]
c1: set mvcc:Version(sql:Table(test), 1) → CREATE TABLE test ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xfftest\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04test\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]

# A concurrent transaction can't see the uncommitted table.
c2:!> SELECT * FROM test
---
c2: Error: invalid input: table test does not exist

# Rolling back the transaction removes the table.
c1:[ops]> ROLLBACK
---
c1: delete mvcc:Version(sql:Table(test), 1) ["\x04\x00\xfftest\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"]
c1: delete mvcc:TxnWrite(1, sql:Table(test)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xfftest\x00\xff\x00\xff\x00\x00"]
c1: delete mvcc:TxnActive(1) ["\x01\x00\x00\x00\x00\x00\x00\x00\x01"]

c1:!> SELECT * FROM test
c2:!> SELECT * FROM test
---
c1: Error: invalid input: table test does not exist
c2: Error: invalid input: table test does not exist

# Committing a transaction does reveal the table.
c1:> BEGIN
c1:[ops]> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
---
c1: set mvcc:TxnWrite(2, sql:Table(test)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x00\xfftest\x00\xff\x00\xff\x00\x00" → ""]
c1: set mvcc:Version(sql:Table(test), 2) → CREATE TABLE test ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xfftest\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x1d\x04test\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]

c2:!> SELECT * FROM test
---
c2: Error: invalid input: table test does not exist

c1:[ops]> COMMIT
---
c1: delete mvcc:TxnWrite(2, sql:Table(test)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x00\xfftest\x00\xff\x00\xff\x00\x00"]
c1: delete mvcc:TxnActive(2) ["\x01\x00\x00\x00\x00\x00\x00\x00\x02"]

c2:> SELECT * FROM test
---
ok


================================================
FILE: src/sql/testscripts/writes/delete
================================================
# Tests basic DELETE.

# Insert some data into a table. We'll use transactions to avoid
# modifying this fixture where possible.
> CREATE TABLE name (id INT PRIMARY KEY, value STRING)
> INSERT INTO name VALUES (1, 'a'), (2, 'b'), (3, 'c')
---
ok

# Deleting from a table works. Use a transaction to keep the fixture.
> BEGIN
[plan,ops]> DELETE FROM name
---
Delete: name
└─ Scan: name
set mvcc:TxnWrite(3, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Row(name, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 3), 3) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]

> SELECT * FROM name
> ROLLBACK
---
ok

# Deleting without a table, or with a missing or multiple tables errors.
!> DELETE
!> DELETE FROM
!> DELETE FROM missing
!> DELETE FROM name, foo
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input
Error: invalid input: table missing does not exist
Error: invalid input: unexpected token ,

# Deleting in an implicit transaction works, and deletes.
[ops]> DELETE FROM name
---
set mvcc:NextVersion → 6 ["\x00" → "\x06"]
set mvcc:TxnActive(5) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x05" → ""]
set mvcc:TxnWrite(5, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 5) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x00"]
set mvcc:TxnWrite(5, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 5) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x00"]
set mvcc:TxnWrite(5, sql:Row(name, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 3), 5) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x00"]
delete mvcc:TxnWrite(5, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(5, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(5, sql:Row(name, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnActive(5) ["\x01\x00\x00\x00\x00\x00\x00\x00\x05"]

> SELECT * FROM name
---
ok

dump
---
mvcc:NextVersion → 6 ["\x00" → "\x06"]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 1), 5) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x00"]
mvcc:Version(sql:Row(name, 2), 2) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(name, 2), 5) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x00"]
mvcc:Version(sql:Row(name, 3), 2) → 3,'c' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x06\x04\x01c"]
mvcc:Version(sql:Row(name, 3), 5) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x00"]

# Bare DELETE errors.
!> DELETE
!> DELETE FROM
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input

# Unknown table errors.
!> DELETE FROM foo
---
Error: invalid input: table foo does not exist

# LIMIT and ORDER BY clauses error.
!> DELETE FROM name LIMIT 2
!> DELETE FROM name ORDER BY id
---
Error: invalid input: unexpected token LIMIT
Error: invalid input: unexpected token ORDER


================================================
FILE: src/sql/testscripts/writes/delete_index
================================================
# Tests index updates during DELETE.

# Create a table with a few indexes.
> CREATE TABLE ref (id INT PRIMARY KEY, value STRING)
> INSERT INTO ref VALUES (1, 'a'), (2, 'b')
> CREATE TABLE name ( \
    id INT PRIMARY KEY, \
    "index" INT INDEX, \
    "unique" STRING UNIQUE, \
    ref_id INT REFERENCES ref \
)
> INSERT INTO name VALUES (1, 2, 'foo', 1)
> INSERT INTO name VALUES (2, 4, 'bar', 1)
> INSERT INTO name VALUES (3, 6, NULL, 2)
> INSERT INTO name VALUES (4, 8, 'baz', 2)
> INSERT INTO name VALUES (5, 10, NULL, 1)
---
ok

# DELETE updates the secondary indexes.
[ops]> DELETE FROM name WHERE id = 4
---
set mvcc:NextVersion → 10 ["\x00" → "\n"]
set mvcc:TxnActive(9) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\t" → ""]
set mvcc:TxnWrite(9, sql:Index(name.index, 8)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00" → ""]
set mvcc:Version(sql:Index(name.index, 8), 9) → None ["\x04\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]
set mvcc:TxnWrite(9, sql:Index(name.unique, 'baz')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x04baz\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.unique, 'baz'), 9) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x04baz\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]
set mvcc:TxnWrite(9, sql:Index(name.ref_id, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Index(name.ref_id, 2), 9) → 3 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(9, sql:Row(name, 4)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 4), 9) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]
delete mvcc:TxnWrite(9, sql:Index(name.index, 8)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(name.ref_id, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(name.unique, 'baz')) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x04baz\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Row(name, 4)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00"]
delete mvcc:TxnActive(9) ["\x01\x00\x00\x00\x00\x00\x00\x00\t"]

# Dump the final state.
> SELECT * FROM name
---
1, 2, 'foo', 1
2, 4, 'bar', 1
3, 6, NULL, 2
5, 10, NULL, 1

dump
---
mvcc:NextVersion → 10 ["\x00" → "\n"]
mvcc:Version(sql:Table(name), 3) → CREATE TABLE name ( id INTEGER PRIMARY KEY, "index" INTEGER DEFAULT NULL INDEX, "unique" STRING DEFAULT NULL UNIQUE INDEX, ref_id INTEGER DEFAULT NULL INDEX REFERENCES ref ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01=\x04name\x00\x04\x02id\x01\x00\x00\x01\x00\x00\x05index\x01\x01\x01\x00\x00\x01\x00\x06unique\x03\x01\x01\x00\x01\x01\x00\x06ref_id\x01\x01\x01\x00\x00\x01\x01\x03ref"]
mvcc:Version(sql:Table(ref), 1) → CREATE TABLE ref ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffref\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1c\x03ref\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Index(name.index, 2), 4) → 1 ["\x04\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(name.index, 4), 5) → 2 ["\x04\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(name.index, 6), 6) → 3 ["\x04\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.index, 8), 7) → 4 ["\x04\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x08"]
mvcc:Version(sql:Index(name.index, 8), 9) → None ["\x04\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]
mvcc:Version(sql:Index(name.index, 10), 8) → 5 ["\x04\x01name\x00\xff\x00\xffindex\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\n"]
mvcc:Version(sql:Index(name.ref_id, 1), 4) → 1 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(name.ref_id, 1), 5) → 1,2 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x05\x02\x02\x02\x02\x04"]
mvcc:Version(sql:Index(name.ref_id, 1), 8) → 1,2,5 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x07\x03\x02\x02\x02\x04\x02\n"]
mvcc:Version(sql:Index(name.ref_id, 2), 6) → 3 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.ref_id, 2), 7) → 3,4 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x05\x02\x02\x06\x02\x08"]
mvcc:Version(sql:Index(name.ref_id, 2), 9) → 3 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.unique, NULL), 6) → 3 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.unique, NULL), 8) → 3,5 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x05\x02\x02\x06\x02\n"]
mvcc:Version(sql:Index(name.unique, 'bar'), 5) → 2 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(name.unique, 'baz'), 7) → 4 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x04baz\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x08"]
mvcc:Version(sql:Index(name.unique, 'baz'), 9) → None ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x04baz\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]
mvcc:Version(sql:Index(name.unique, 'foo'), 4) → 1 ["\x04\x01name\x00\xff\x00\xffunique\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Row(name, 1), 4) → 1,2,'foo',1 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x0c\x04\x02\x02\x02\x04\x04\x03foo\x02\x02"]
mvcc:Version(sql:Row(name, 2), 5) → 2,4,'bar',1 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x0c\x04\x02\x04\x02\x08\x04\x03bar\x02\x02"]
mvcc:Version(sql:Row(name, 3), 6) → 3,6,NULL,2 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x08\x04\x02\x06\x02\x0c\x00\x02\x04"]
mvcc:Version(sql:Row(name, 4), 7) → 4,8,'baz',2 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x0c\x04\x02\x08\x02\x10\x04\x03baz\x02\x04"]
mvcc:Version(sql:Row(name, 4), 9) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x00"]
mvcc:Version(sql:Row(name, 5), 8) → 5,10,NULL,1 ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x08\x04\x02\n\x02\x14\x00\x02\x02"]
mvcc:Version(sql:Row(ref, 1), 2) → 1,'a' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(ref, 2), 2) → 2,'b' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]


================================================
FILE: src/sql/testscripts/writes/delete_reference
================================================
# Tests DELETE with foreign key constraints.

# Create a few reference tables with data.
> CREATE TABLE ref (id INT PRIMARY KEY, value STRING)
> CREATE TABLE sref (id STRING PRIMARY KEY)
> INSERT INTO ref VALUES (1, 'a'), (2, 'b'), (3, 'c')
> INSERT INTO sref VALUES ('a'), ('b')

> CREATE TABLE name ( \
    id INT PRIMARY KEY, \
    ref_id INT REFERENCES ref, \
    sref_id STRING NOT NULL REFERENCES sref \
)
> INSERT INTO name VALUES (1, 1, 'a')
> INSERT INTO name VALUES (2, NULL, 'b')
> INSERT INTO name VALUES (3, 2, 'b')
> INSERT INTO name VALUES (4, 2, 'a')
> INSERT INTO name VALUES (5, 1, 'a')
---
ok

# DELETE with a reference errors. It does not remove rows that could be removed
# in isolation.
!> DELETE FROM ref
!> DELETE FROM ref WHERE id = 1
---
Error: invalid input: row referenced by name.id=1
Error: invalid input: row referenced by name.id=1

> SELECT * FROM ref
---
1, 'a'
2, 'b'
3, 'c'

# DELETE of an unreferenced row succeeds.
[ops]> DELETE FROM ref WHERE id = 3
---
set mvcc:NextVersion → 14 ["\x00" → "\x0e"]
set mvcc:TxnActive(13) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\r" → ""]
set mvcc:TxnWrite(13, sql:Row(ref, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(ref, 3), 13) → None ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
delete mvcc:TxnWrite(13, sql:Row(ref, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnActive(13) ["\x01\x00\x00\x00\x00\x00\x00\x00\r"]

> SELECT * FROM ref
---
1, 'a'
2, 'b'

# DELETE in the source table succeeds. It also removes the index entries.
[ops]> DELETE FROM name WHERE id = 2 OR id = 3
---
set mvcc:NextVersion → 15 ["\x00" → "\x0f"]
set mvcc:TxnActive(14) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0e" → ""]
set mvcc:TxnWrite(14, sql:Index(name.ref_id, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.ref_id, NULL), 14) → None ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
set mvcc:TxnWrite(14, sql:Index(name.sref_id, 'b')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.sref_id, 'b'), 14) → 3 ["\x04\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(14, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 14) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
set mvcc:TxnWrite(14, sql:Index(name.ref_id, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Index(name.ref_id, 2), 14) → 4 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x03\x01\x02\x08"]
set mvcc:TxnWrite(14, sql:Index(name.sref_id, 'b')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.sref_id, 'b'), 14) → None ["\x04\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
set mvcc:TxnWrite(14, sql:Row(name, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 3), 14) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
delete mvcc:TxnWrite(14, sql:Index(name.ref_id, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(14, sql:Index(name.ref_id, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(14, sql:Index(name.sref_id, 'b')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(14, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(14, sql:Row(name, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnActive(14) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0e"]

# DELETE of a no-longer-referenced row succeeds.
> DELETE FROM sref WHERE id = 'b'
---
ok

# Test self-references.
> CREATE TABLE self (id INT PRIMARY KEY, self_id INT REFERENCES self)
> INSERT INTO self VALUES (1, 1)
> INSERT INTO self VALUES (2, 2)
> INSERT INTO self VALUES (3, 2)
> INSERT INTO self VALUES (4, 2)
---
ok

# Deleting all self-ref rows always works.
> BEGIN
> DELETE FROM self
> SELECT * FROM self
> ROLLBACK
---
ok

# Deleting a referenced row errors.
!> DELETE FROM self WHERE id = 2
---
Error: invalid input: row referenced by self.id=3

# Deleting an unreferenced row works.
> DELETE FROM self WHERE id = 4
---
ok

# Deleting a row only referencing itself works.
> DELETE FROM self WHERE id = 1
---
ok

> SELECT * FROM self
---
2, 2
3, 2

# Dump the raw dataset.
dump
---
mvcc:NextVersion → 25 ["\x00" → "\x19"]
mvcc:Version(sql:Table(name), 5) → CREATE TABLE name ( id INTEGER PRIMARY KEY, ref_id INTEGER DEFAULT NULL INDEX REFERENCES ref, sref_id STRING NOT NULL INDEX REFERENCES sref ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x015\x04name\x00\x03\x02id\x01\x00\x00\x01\x00\x00\x06ref_id\x01\x01\x01\x00\x00\x01\x01\x03ref\x07sref_id\x03\x00\x00\x00\x01\x01\x04sref"]
mvcc:Version(sql:Table(ref), 1) → CREATE TABLE ref ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffref\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1c\x03ref\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Table(self), 16) → CREATE TABLE self ( id INTEGER PRIMARY KEY, self_id INTEGER DEFAULT NULL INDEX REFERENCES self ) ["\x04\x00\xffself\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10" → "\x01$\x04self\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x07self_id\x01\x01\x01\x00\x00\x01\x01\x04self"]
mvcc:Version(sql:Table(sref), 2) → CREATE TABLE sref ( id STRING PRIMARY KEY ) ["\x04\x00\xffsref\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x10\x04sref\x00\x01\x02id\x03\x00\x00\x01\x00\x00"]
mvcc:Version(sql:Index(name.ref_id, NULL), 7) → 2 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(name.ref_id, NULL), 14) → None ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
mvcc:Version(sql:Index(name.ref_id, 1), 6) → 1 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(name.ref_id, 1), 10) → 1,5 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x05\x02\x02\x02\x02\n"]
mvcc:Version(sql:Index(name.ref_id, 2), 8) → 3 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x06"]
mvcc:Version(sql:Index(name.ref_id, 2), 9) → 3,4 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x05\x02\x02\x06\x02\x08"]
mvcc:Version(sql:Index(name.ref_id, 2), 14) → 4 ["\x04\x01name\x00\xff\x00\xffref_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x03\x01\x02\x08"]
mvcc:Version(sql:Index(name.sref_id, 'a'), 6) → 1 ["\x04\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04a\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(name.sref_id, 'a'), 9) → 1,4 ["\x04\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04a\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x05\x02\x02\x02\x02\x08"]
mvcc:Version(sql:Index(name.sref_id, 'a'), 10) → 1,4,5 ["\x04\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04a\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x07\x03\x02\x02\x02\x08\x02\n"]
mvcc:Version(sql:Index(name.sref_id, 'b'), 7) → 2 ["\x04\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(name.sref_id, 'b'), 8) → 2,3 ["\x04\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x05\x02\x02\x04\x02\x06"]
mvcc:Version(sql:Index(name.sref_id, 'b'), 14) → None ["\x04\x01name\x00\xff\x00\xffsref_id\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
mvcc:Version(sql:Index(self.self_id, 1), 17) → 1 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11" → "\x01\x03\x01\x02\x02"]
mvcc:Version(sql:Index(self.self_id, 1), 24) → None ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18" → "\x00"]
mvcc:Version(sql:Index(self.self_id, 2), 18) → 2 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\x03\x01\x02\x04"]
mvcc:Version(sql:Index(self.self_id, 2), 19) → 2,3 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13" → "\x01\x05\x02\x02\x04\x02\x06"]
mvcc:Version(sql:Index(self.self_id, 2), 20) → 2,3,4 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14" → "\x01\x07\x03\x02\x04\x02\x06\x02\x08"]
mvcc:Version(sql:Index(self.self_id, 2), 23) → 2,3 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17" → "\x01\x05\x02\x02\x04\x02\x06"]
mvcc:Version(sql:Row(name, 1), 6) → 1,1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x08\x03\x02\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 2), 7) → 2,NULL,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x07\x03\x02\x04\x00\x04\x01b"]
mvcc:Version(sql:Row(name, 2), 14) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
mvcc:Version(sql:Row(name, 3), 8) → 3,2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x08\x03\x02\x06\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(name, 3), 14) → None ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
mvcc:Version(sql:Row(name, 4), 9) → 4,2,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x08\x03\x02\x08\x02\x04\x04\x01a"]
mvcc:Version(sql:Row(name, 5), 10) → 5,1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x08\x03\x02\n\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(ref, 1), 3) → 1,'a' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(ref, 2), 3) → 2,'b' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(ref, 3), 3) → 3,'c' ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x06\x04\x01c"]
mvcc:Version(sql:Row(ref, 3), 13) → None ["\x04\x02ref\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
mvcc:Version(sql:Row(self, 1), 17) → 1,1 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11" → "\x01\x05\x02\x02\x02\x02\x02"]
mvcc:Version(sql:Row(self, 1), 24) → None ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18" → "\x00"]
mvcc:Version(sql:Row(self, 2), 18) → 2,2 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\x05\x02\x02\x04\x02\x04"]
mvcc:Version(sql:Row(self, 3), 19) → 3,2 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13" → "\x01\x05\x02\x02\x06\x02\x04"]
mvcc:Version(sql:Row(self, 4), 20) → 4,2 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x14" → "\x01\x05\x02\x02\x08\x02\x04"]
mvcc:Version(sql:Row(self, 4), 23) → None ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17" → "\x00"]
mvcc:Version(sql:Row(sref, 'a'), 4) → 'a' ["\x04\x02sref\x00\xff\x00\xff\x04a\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x04\x01\x04\x01a"]
mvcc:Version(sql:Row(sref, 'b'), 4) → 'b' ["\x04\x02sref\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x04\x01\x04\x01b"]
mvcc:Version(sql:Row(sref, 'b'), 15) → None ["\x04\x02sref\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x00"]


================================================
FILE: src/sql/testscripts/writes/delete_where
================================================
# Tests filtered DELETE statements.

# Create a table with some data.
> CREATE TABLE name (id INT PRIMARY KEY, value STRING, "index" INT INDEX)
> INSERT INTO name VALUES (1, 'a', 1), (2, 'b', 2), (3, 'c', 3), (0, NULL, NULL)
---
ok

# Boolean filters work, and are trivial.
> BEGIN
[plan]> DELETE FROM name WHERE true
> SELECT * FROM name
> ROLLBACK
---
Delete: name
└─ Scan: name

[plan]> DELETE FROM name WHERE false
> SELECT * FROM name
---
Delete: name
└─ Nothing
0, NULL, NULL
1, 'a', 1
2, 'b', 2
3, 'c', 3

# Deleting by primary key lookup works.
> BEGIN
[plan]> DELETE FROM name WHERE id = 1 OR id = 3
> SELECT * FROM name
> ROLLBACK
---
Delete: name
└─ KeyLookup: name (1, 3)
0, NULL, NULL
2, 'b', 2

# Deleting by secondary index lookup works.
> BEGIN
[plan]> DELETE FROM name WHERE "index" = 3
> SELECT * FROM name
> ROLLBACK
---
Delete: name
└─ IndexLookup: name.index (3)
0, NULL, NULL
1, 'a', 1
2, 'b', 2

# Including IS NULL predicates.
> BEGIN
[plan]> DELETE FROM name WHERE "index" IS NULL
> SELECT * FROM name
> ROLLBACK
---
Delete: name
└─ IndexLookup: name.index (NULL)
1, 'a', 1
2, 'b', 2
3, 'c', 3

# Deleting by arbitrary predicate works.
> BEGIN
[plan]> DELETE FROM name WHERE id >= 5 - 2 OR (value LIKE 'a') IS NULL
> SELECT * FROM name
> ROLLBACK
---
Delete: name
└─ Scan: name (name.id > 3 OR name.id = 3 OR name.value LIKE 'a' IS NULL)
1, 'a', 1
2, 'b', 2

# Other types error, except NULL which is equivalent to false.
!> DELETE FROM name WHERE 0
!> DELETE FROM name WHERE 1
!> DELETE FROM name WHERE 3.14
!> DELETE FROM name WHERE NaN
!> DELETE FROM name WHERE ''
!> DELETE FROM name WHERE 'true'
---
Error: invalid input: filter returned 0, expected boolean
Error: invalid input: filter returned 1, expected boolean
Error: invalid input: filter returned 3.14, expected boolean
Error: invalid input: filter returned NaN, expected boolean
Error: invalid input: filter returned '', expected boolean
Error: invalid input: filter returned 'true', expected boolean

> DELETE FROM name WHERE NULL
> SELECT * FROM name
---
0, NULL, NULL
1, 'a', 1
2, 'b', 2
3, 'c', 3

# Bare WHERE errors.
!> DELETE FROM name WHERE
---
Error: invalid input: unexpected end of input

# Missing column errors.
!> DELETE FROM name WHERE missing = 'foo'
---
Error: invalid input: unknown column missing


================================================
FILE: src/sql/testscripts/writes/insert
================================================
# Tests basic INSERT functionality.

> CREATE TABLE name (id INT PRIMARY KEY, value STRING)
---
ok

# INSERT writes a row to the table, and returns the number of rows.
[plan,result,ops]> INSERT INTO name VALUES (1, 'a')
---
Insert: name
└─ Values: 1, 'a'
set mvcc:NextVersion → 3 ["\x00" → "\x03"]
set mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
set mvcc:TxnWrite(2, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
delete mvcc:TxnWrite(2, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(2) ["\x01\x00\x00\x00\x00\x00\x00\x00\x02"]
Insert { count: 1 }

# It can also write multiple rows.
[plan,result,ops]> INSERT INTO name VALUES (2, 'b'), (3, 'c'), (4, 'd')
---
Insert: name
└─ Values: 3 rows
set mvcc:NextVersion → 4 ["\x00" → "\x04"]
set mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
set mvcc:TxnWrite(3, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 3) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x04\x04\x01b"]
set mvcc:TxnWrite(3, sql:Row(name, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 3), 3) → 3,'c' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x06\x04\x01c"]
set mvcc:TxnWrite(3, sql:Row(name, 4)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 4), 3) → 4,'d' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x08\x04\x01d"]
delete mvcc:TxnWrite(3, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(3, sql:Row(name, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnWrite(3, sql:Row(name, 4)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00"]
delete mvcc:TxnActive(3) ["\x01\x00\x00\x00\x00\x00\x00\x00\x03"]
Insert { count: 3 }

> SELECT * FROM name
---
1, 'a'
2, 'b'
3, 'c'
4, 'd'

dump
---
mvcc:NextVersion → 4 ["\x00" → "\x04"]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 2), 3) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(name, 3), 3) → 3,'c' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x06\x04\x01c"]
mvcc:Version(sql:Row(name, 4), 3) → 4,'d' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x06\x02\x02\x08\x04\x01d"]

# INSERTs can use expressions, but only constant ones.
> INSERT INTO name VALUES (2^2+1, 'abc')
> SELECT * FROM name
---
1, 'a'
2, 'b'
3, 'c'
4, 'd'
5, 'abc'

!> INSERT INTO name VALUES (id + 2, 'abc')
---
Error: invalid input: expression must be constant, found column id

# INSERT with too many values errors. Too few values is tested by insert_default.
!> INSERT INTO name VALUES (6, 'e', NULL)
---
Error: invalid input: too many values for table name

# Bare INSERT errors, as does INSERT without a VALUES clause.
!> INSERT
!> INSERT INTO
!> INSERT INTO name
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input

# Unknown table or column errors.
!> INSERT INTO foo VALUES (1)
!> INSERT INTO name (id, foo) VALUES (1, 'bar')
---
Error: invalid input: table foo does not exist
Error: invalid input: unknown column foo in table name

# Multiple tables errors.
!> INSERT INTO name, other VALUES (1)
---
Error: invalid input: expected token VALUES, found ,

# Specifying the same column multiple times errors.
!> INSERT INTO name (id, value, value) VALUES (6, 'e', 'f')
---
Error: invalid input: column value given multiple times


================================================
FILE: src/sql/testscripts/writes/insert_datatypes
================================================
# Tests INSERT of all datatypes.

# Create columns with all datatypes.
> CREATE TABLE datatypes ( \
    id INTEGER PRIMARY KEY, \
    "bool" BOOL, \
    "int" INT, \
    "float" FLOAT, \
    "string" STRING \
)
---
ok

# Booleans.
> BEGIN
> INSERT INTO datatypes (id, "bool") VALUES (1, true)
> INSERT INTO datatypes (id, "bool") VALUES (2, false)
> INSERT INTO datatypes (id, "bool") VALUES (3, NULL)
---
ok

> SELECT * FROM datatypes
> ROLLBACK
---
1, TRUE, NULL, NULL, NULL
2, FALSE, NULL, NULL, NULL
3, NULL, NULL, NULL, NULL

!> INSERT INTO datatypes (id, "bool") VALUES (0, 1)
!> INSERT INTO datatypes (id, "bool") VALUES (0, 3.14)
!> INSERT INTO datatypes (id, "bool") VALUES (0, 'false')
---
Error: invalid input: invalid datatype INTEGER for BOOLEAN column bool
Error: invalid input: invalid datatype FLOAT for BOOLEAN column bool
Error: invalid input: invalid datatype STRING for BOOLEAN column bool

# Integers.
> BEGIN
> INSERT INTO datatypes (id, "int") VALUES (1, 1)
> INSERT INTO datatypes (id, "int") VALUES (2, 0)
> INSERT INTO datatypes (id, "int") VALUES (3, -1)
> INSERT INTO datatypes (id, "int") VALUES (4, 9223372036854775807)
> INSERT INTO datatypes (id, "int") VALUES (5, -9223372036854775807)
> INSERT INTO datatypes (id, "int") VALUES (6, NULL)
---
ok

> SELECT * FROM datatypes
> ROLLBACK
---
1, NULL, 1, NULL, NULL
2, NULL, 0, NULL, NULL
3, NULL, -1, NULL, NULL
4, NULL, 9223372036854775807, NULL, NULL
5, NULL, -9223372036854775807, NULL, NULL
6, NULL, NULL, NULL, NULL

!> INSERT INTO datatypes (id, "int") VALUES (0, false)
!> INSERT INTO datatypes (id, "int") VALUES (0, 3.0)
!> INSERT INTO datatypes (id, "int") VALUES (0, '0')
---
Error: invalid input: invalid datatype BOOLEAN for INTEGER column int
Error: invalid input: invalid datatype FLOAT for INTEGER column int
Error: invalid input: invalid datatype STRING for INTEGER column int

# Floats. -0.0 and -NaN are normalized to 0.0 and NaN.
> BEGIN
> INSERT INTO datatypes (id, "float") VALUES (1, 3.14)
> INSERT INTO datatypes (id, "float") VALUES (2, -3.14)
> INSERT INTO datatypes (id, "float") VALUES (3, 0.0)
> INSERT INTO datatypes (id, "float") VALUES (4, -0.0)
> INSERT INTO datatypes (id, "float") VALUES (5, 1.23456789012345e308)
> INSERT INTO datatypes (id, "float") VALUES (6, -1.23456789012345e308)
> INSERT INTO datatypes (id, "float") VALUES (7, INFINITY)
> INSERT INTO datatypes (id, "float") VALUES (8, -INFINITY)
> INSERT INTO datatypes (id, "float") VALUES (9, -NAN)
> INSERT INTO datatypes (id, "float") VALUES (10, NAN)
> INSERT INTO datatypes (id, "float") VALUES (11, NULL)
---
ok

> SELECT * FROM datatypes
> ROLLBACK
---
1, NULL, NULL, 3.14, NULL
2, NULL, NULL, -3.14, NULL
3, NULL, NULL, 0.0, NULL
4, NULL, NULL, 0.0, NULL
5, NULL, NULL, 1.23456789012345e308, NULL
6, NULL, NULL, -1.23456789012345e308, NULL
7, NULL, NULL, inf, NULL
8, NULL, NULL, -inf, NULL
9, NULL, NULL, NaN, NULL
10, NULL, NULL, NaN, NULL
11, NULL, NULL, NULL, NULL

!> INSERT INTO datatypes (id, "float") VALUES (0, false)
!> INSERT INTO datatypes (id, "float") VALUES (0, 3)
!> INSERT INTO datatypes (id, "float") VALUES (0, '0')
---
Error: invalid input: invalid datatype BOOLEAN for FLOAT column float
Error: invalid input: invalid datatype INTEGER for FLOAT column float
Error: invalid input: invalid datatype STRING for FLOAT column float

# Strings.
> BEGIN
> INSERT INTO datatypes (id, "string") VALUES (1, '')
> INSERT INTO datatypes (id, "string") VALUES (2, '  ')
> INSERT INTO datatypes (id, "string") VALUES (3, 'abc')
> INSERT INTO datatypes (id, "string") VALUES (4, 'Hi! 👋')
> INSERT INTO datatypes (id, "string") VALUES (5, NULL)
---
ok

> SELECT * FROM datatypes
> ROLLBACK
---
1, NULL, NULL, NULL, ''
2, NULL, NULL, NULL, '  '
3, NULL, NULL, NULL, 'abc'
4, NULL, NULL, NULL, 'Hi! 👋'
5, NULL, NULL, NULL, NULL

!> INSERT INTO datatypes (id, "string") VALUES (0, false)
!> INSERT INTO datatypes (id, "string") VALUES (0, 3)
!> INSERT INTO datatypes (id, "string") VALUES (0, 3.14)
---
Error: invalid input: invalid datatype BOOLEAN for STRING column string
Error: invalid input: invalid datatype INTEGER for STRING column string
Error: invalid input: invalid datatype FLOAT for STRING column string


================================================
FILE: src/sql/testscripts/writes/insert_default
================================================
# Tests INSERT handling of DEFAULT values.

> CREATE TABLE defaults ( \
    id INTEGER PRIMARY KEY, \
    required BOOLEAN NOT NULL, \
    "null" BOOLEAN, \
    "boolean" BOOLEAN DEFAULT TRUE, \
    "float" FLOAT DEFAULT 3.14, \
    "integer" INTEGER DEFAULT 7, \
    "string" STRING DEFAULT 'foo' \
)
---
ok

# INSERT without specifying default columns fills in defaults.
> INSERT INTO defaults (id, required) VALUES (1, true)
> INSERT INTO defaults VALUES (2, false)
---
ok

> SELECT * FROM defaults
---
1, TRUE, NULL, TRUE, 3.14, 7, 'foo'
2, FALSE, NULL, TRUE, 3.14, 7, 'foo'

# INSERT only specifying some default columns fills in rest.
> INSERT INTO defaults ("integer", id, "null", required) VALUES (9, 3, NULL, false)
---
ok

> SELECT * FROM defaults
---
1, TRUE, NULL, TRUE, 3.14, 7, 'foo'
2, FALSE, NULL, TRUE, 3.14, 7, 'foo'
3, FALSE, NULL, TRUE, 3.14, 9, 'foo'

# Using a variable number of values works.
> INSERT INTO defaults VALUES (4, false, NULL, false), (5, true), (6, true, false, true, 3.14, 9, 'bar')
---
ok

> SELECT * FROM defaults
---
1, TRUE, NULL, TRUE, 3.14, 7, 'foo'
2, FALSE, NULL, TRUE, 3.14, 7, 'foo'
3, FALSE, NULL, TRUE, 3.14, 9, 'foo'
4, FALSE, NULL, FALSE, 3.14, 7, 'foo'
5, TRUE, NULL, TRUE, 3.14, 7, 'foo'
6, TRUE, FALSE, TRUE, 3.14, 9, 'bar'

# INSERT with all NULLs does not yield default values.
> INSERT INTO defaults VALUES (7, false, NULL, NULL, NULL, NULL, NULL)
---
ok

> SELECT * FROM defaults
---
1, TRUE, NULL, TRUE, 3.14, 7, 'foo'
2, FALSE, NULL, TRUE, 3.14, 7, 'foo'
3, FALSE, NULL, TRUE, 3.14, 9, 'foo'
4, FALSE, NULL, FALSE, 3.14, 7, 'foo'
5, TRUE, NULL, TRUE, 3.14, 7, 'foo'
6, TRUE, FALSE, TRUE, 3.14, 9, 'bar'
7, FALSE, NULL, NULL, NULL, NULL, NULL

# Errors if required column isn't given.
!> INSERT INTO defaults VALUES (8)
---
Error: invalid input: no value given for column required with no default


================================================
FILE: src/sql/testscripts/writes/insert_index
================================================
# Tests INSERT index writes.

> CREATE TABLE "index" ( \
    id INT PRIMARY KEY, \
    "bool" BOOL INDEX, \
    "int" INT INDEX, \
    "float" FLOAT INDEX, \
    "string" STRING INDEX \
)
---
ok

# An INSERT writes to all indexes.
[ops]> INSERT INTO "index" VALUES (1, TRUE, 7, 3.14, 'foo')
---
set mvcc:NextVersion → 3 ["\x00" → "\x03"]
set mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
set mvcc:TxnWrite(2, sql:Row(index, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 1), 2) → 1,TRUE,7,3.14,'foo' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x15\x05\x02\x02\x01\x01\x02\x0e\x03\x1f\x85\xebQ\xb8\x1e\t@\x04\x03foo"]
set mvcc:TxnWrite(2, sql:Index(index.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, TRUE), 2) → 1 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(2, sql:Index(index.int, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 7), 2) → 1 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(2, sql:Index(index.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 3.14), 2) → 1 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(2, sql:Index(index.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'foo'), 2) → 1 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x03\x01\x02\x02"]
delete mvcc:TxnWrite(2, sql:Index(index.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(2, sql:Index(index.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(2, sql:Index(index.int, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnWrite(2, sql:Index(index.string, 'foo')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(2, sql:Row(index, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(2) ["\x01\x00\x00\x00\x00\x00\x00\x00\x02"]

# Another insert with the same values adds to the index entries.
[ops]> INSERT INTO "index" VALUES (2, TRUE, 7, 3.14, 'foo')
---
set mvcc:NextVersion → 4 ["\x00" → "\x04"]
set mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
set mvcc:TxnWrite(3, sql:Row(index, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 2), 3) → 2,TRUE,7,3.14,'foo' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x15\x05\x02\x04\x01\x01\x02\x0e\x03\x1f\x85\xebQ\xb8\x1e\t@\x04\x03foo"]
set mvcc:TxnWrite(3, sql:Index(index.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, TRUE), 3) → 1,2 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(3, sql:Index(index.int, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 7), 3) → 1,2 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(3, sql:Index(index.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 3.14), 3) → 1,2 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(3, sql:Index(index.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'foo'), 3) → 1,2 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x05\x02\x02\x02\x02\x04"]
delete mvcc:TxnWrite(3, sql:Index(index.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.int, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.string, 'foo')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(3, sql:Row(index, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(3) ["\x01\x00\x00\x00\x00\x00\x00\x00\x03"]

# An insert with different values writes new index entries.
[ops]> INSERT INTO "index" VALUES (3, FALSE, 0, 2.718, '')
---
set mvcc:NextVersion → 5 ["\x00" → "\x05"]
set mvcc:TxnActive(4) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x04" → ""]
set mvcc:TxnWrite(4, sql:Row(index, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 3), 4) → 3,FALSE,0,2.718,'' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x12\x05\x02\x06\x01\x00\x02\x00\x03X9\xb4\xc8v\xbe\x05@\x04\x00"]
set mvcc:TxnWrite(4, sql:Index(index.bool, FALSE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, FALSE), 4) → 3 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(4, sql:Index(index.int, 0)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 0), 4) → 3 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(4, sql:Index(index.float, 2.718)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 2.718), 4) → 3 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(4, sql:Index(index.string, '')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, ''), 4) → 3 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x06"]
delete mvcc:TxnWrite(4, sql:Index(index.bool, FALSE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(index.float, 2.718)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(index.int, 0)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(index.string, '')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Row(index, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnActive(4) ["\x01\x00\x00\x00\x00\x00\x00\x00\x04"]

# Inserts with NULLs add NULL index entries. These are used for IS NULL queries.
[ops]> INSERT INTO "index" VALUES (4), (5)
---
set mvcc:NextVersion → 6 ["\x00" → "\x06"]
set mvcc:TxnActive(5) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x05" → ""]
set mvcc:TxnWrite(5, sql:Row(index, 4)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 4), 5) → 4,NULL,NULL,NULL,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x07\x05\x02\x08\x00\x00\x00\x00"]
set mvcc:TxnWrite(5, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 5) → 4 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x08"]
set mvcc:TxnWrite(5, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 5) → 4 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x08"]
set mvcc:TxnWrite(5, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 5) → 4 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x08"]
set mvcc:TxnWrite(5, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 5) → 4 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x03\x01\x02\x08"]
set mvcc:TxnWrite(5, sql:Row(index, 5)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x05\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 5), 5) → 5,NULL,NULL,NULL,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x07\x05\x02\n\x00\x00\x00\x00"]
set mvcc:TxnWrite(5, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 5) → 4,5 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x05\x02\x02\x08\x02\n"]
set mvcc:TxnWrite(5, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 5) → 4,5 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x05\x02\x02\x08\x02\n"]
set mvcc:TxnWrite(5, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 5) → 4,5 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x05\x02\x02\x08\x02\n"]
set mvcc:TxnWrite(5, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 5) → 4,5 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05" → "\x01\x05\x02\x02\x08\x02\n"]
delete mvcc:TxnWrite(5, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(5, sql:Index(index.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(5, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(5, sql:Index(index.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(5, sql:Row(index, 4)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00"]
delete mvcc:TxnWrite(5, sql:Row(index, 5)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x05\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x05\x00\x00"]
delete mvcc:TxnActive(5) ["\x01\x00\x00\x00\x00\x00\x00\x00\x05"]

# Float NaNs are considered equal and indexed.
[ops]> INSERT INTO "index" (id, "float") VALUES (6, NAN), (7, NAN)
---
set mvcc:NextVersion → 7 ["\x00" → "\x07"]
set mvcc:TxnActive(6) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x06" → ""]
set mvcc:TxnWrite(6, sql:Row(index, 6)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x06\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 6), 6) → 6,NULL,NULL,NaN,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x0f\x05\x02\x0c\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf8\x7f\x00"]
set mvcc:TxnWrite(6, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 6) → 4,5,6 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x07\x03\x02\x08\x02\n\x02\x0c"]
set mvcc:TxnWrite(6, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 6) → 4,5,6 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x07\x03\x02\x08\x02\n\x02\x0c"]
set mvcc:TxnWrite(6, sql:Index(index.float, NaN)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NaN), 6) → 6 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x0c"]
set mvcc:TxnWrite(6, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 6) → 4,5,6 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x07\x03\x02\x08\x02\n\x02\x0c"]
set mvcc:TxnWrite(6, sql:Row(index, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 7), 6) → 7,NULL,NULL,NaN,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x0f\x05\x02\x0e\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf8\x7f\x00"]
set mvcc:TxnWrite(6, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 6) → 4,5,6,7 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\t\x04\x02\x08\x02\n\x02\x0c\x02\x0e"]
set mvcc:TxnWrite(6, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 6) → 4,5,6,7 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\t\x04\x02\x08\x02\n\x02\x0c\x02\x0e"]
set mvcc:TxnWrite(6, sql:Index(index.float, NaN)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NaN), 6) → 6,7 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x05\x02\x02\x0c\x02\x0e"]
set mvcc:TxnWrite(6, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 6) → 4,5,6,7 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\t\x04\x02\x08\x02\n\x02\x0c\x02\x0e"]
delete mvcc:TxnWrite(6, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.float, NaN)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Row(index, 6)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x06\x00\x00"]
delete mvcc:TxnWrite(6, sql:Row(index, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnActive(6) ["\x01\x00\x00\x00\x00\x00\x00\x00\x06"]

# Float 0.0 and -0.0 are normalized as 0.0 and indexed as such.
[ops]> INSERT INTO "index" (id, "float") VALUES (8, -0.0), (9, 0.0)
---
set mvcc:NextVersion → 8 ["\x00" → "\x08"]
set mvcc:TxnActive(7) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x07" → ""]
set mvcc:TxnWrite(7, sql:Row(index, 8)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 8), 7) → 8,NULL,NULL,0.0,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x0f\x05\x02\x10\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"]
set mvcc:TxnWrite(7, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 7) → 4,5,6,7,8 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x0b\x05\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10"]
set mvcc:TxnWrite(7, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 7) → 4,5,6,7,8 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x0b\x05\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10"]
set mvcc:TxnWrite(7, sql:Index(index.float, 0.0)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 0.0), 7) → 8 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x10"]
set mvcc:TxnWrite(7, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 7) → 4,5,6,7,8 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x0b\x05\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10"]
set mvcc:TxnWrite(7, sql:Row(index, 9)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\t\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 9), 7) → 9,NULL,NULL,0.0,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\t\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x0f\x05\x02\x12\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"]
set mvcc:TxnWrite(7, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 7) → 4,5,6,7,8,9 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\r\x06\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12"]
set mvcc:TxnWrite(7, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 7) → 4,5,6,7,8,9 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\r\x06\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12"]
set mvcc:TxnWrite(7, sql:Index(index.float, 0.0)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 0.0), 7) → 8,9 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x05\x02\x02\x10\x02\x12"]
set mvcc:TxnWrite(7, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 7) → 4,5,6,7,8,9 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\r\x06\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12"]
delete mvcc:TxnWrite(7, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.float, 0.0)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Row(index, 8)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00"]
delete mvcc:TxnWrite(7, sql:Row(index, 9)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\t\x00\x00"]
delete mvcc:TxnActive(7) ["\x01\x00\x00\x00\x00\x00\x00\x00\x07"]

# Float INFINITY is also indexed.
[ops]> INSERT INTO "index" (id, "float") VALUES (10, -INFINITY), (11, INFINITY)
---
set mvcc:NextVersion → 9 ["\x00" → "\t"]
set mvcc:TxnActive(8) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x08" → ""]
set mvcc:TxnWrite(8, sql:Row(index, 10)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\n\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 10), 8) → 10,NULL,NULL,-inf,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x0f\x05\x02\x14\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\xff\x00"]
set mvcc:TxnWrite(8, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 8) → 4,5,6,7,8,9,10 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x0f\x07\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14"]
set mvcc:TxnWrite(8, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 8) → 4,5,6,7,8,9,10 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x0f\x07\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14"]
set mvcc:TxnWrite(8, sql:Index(index.float, -inf)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, -inf), 8) → 10 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x14"]
set mvcc:TxnWrite(8, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 8) → 4,5,6,7,8,9,10 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x0f\x07\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14"]
set mvcc:TxnWrite(8, sql:Row(index, 11)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0b\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 11), 8) → 11,NULL,NULL,inf,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x0f\x05\x02\x16\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\x7f\x00"]
set mvcc:TxnWrite(8, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 8) → 4,5,6,7,8,9,10,11 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x11\x08\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16"]
set mvcc:TxnWrite(8, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 8) → 4,5,6,7,8,9,10,11 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x11\x08\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16"]
set mvcc:TxnWrite(8, sql:Index(index.float, inf)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, inf), 8) → 11 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x16"]
set mvcc:TxnWrite(8, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 8) → 4,5,6,7,8,9,10,11 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x11\x08\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16"]
delete mvcc:TxnWrite(8, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.float, -inf)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.float, inf)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(index, 10)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\n\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(index, 11)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0b\x00\x00"]
delete mvcc:TxnActive(8) ["\x01\x00\x00\x00\x00\x00\x00\x00\x08"]

[ops]> INSERT INTO "index" (id, "float") VALUES (12, -INFINITY), (13, INFINITY)
---
set mvcc:NextVersion → 10 ["\x00" → "\n"]
set mvcc:TxnActive(9) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\t" → ""]
set mvcc:TxnWrite(9, sql:Row(index, 12)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0c\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 12), 9) → 12,NULL,NULL,-inf,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0c\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x0f\x05\x02\x18\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\xff\x00"]
set mvcc:TxnWrite(9, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 9) → 4,5,6,7,8,9,10,11,12 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x13\t\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18"]
set mvcc:TxnWrite(9, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 9) → 4,5,6,7,8,9,10,11,12 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x13\t\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18"]
set mvcc:TxnWrite(9, sql:Index(index.float, -inf)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, -inf), 9) → 10,12 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x05\x02\x02\x14\x02\x18"]
set mvcc:TxnWrite(9, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 9) → 4,5,6,7,8,9,10,11,12 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x13\t\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18"]
set mvcc:TxnWrite(9, sql:Row(index, 13)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\r\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 13), 9) → 13,NULL,NULL,inf,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\r\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x0f\x05\x02\x1a\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\x7f\x00"]
set mvcc:TxnWrite(9, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 9) → 4,5,6,7,8,9,10,11,12,13 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x15\n\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a"]
set mvcc:TxnWrite(9, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 9) → 4,5,6,7,8,9,10,11,12,13 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x15\n\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a"]
set mvcc:TxnWrite(9, sql:Index(index.float, inf)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, inf), 9) → 11,13 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x05\x02\x02\x16\x02\x1a"]
set mvcc:TxnWrite(9, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 9) → 4,5,6,7,8,9,10,11,12,13 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x15\n\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a"]
delete mvcc:TxnWrite(9, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(index.float, -inf)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(index.float, inf)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(index.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Row(index, 12)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0c\x00\x00"]
delete mvcc:TxnWrite(9, sql:Row(index, 13)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\r\x00\x00"]
delete mvcc:TxnActive(9) ["\x01\x00\x00\x00\x00\x00\x00\x00\t"]

# Empty strings are considered equal.
[ops]> INSERT INTO "index" (id, "string") VALUES (14, ''), (15, '')
---
set mvcc:NextVersion → 11 ["\x00" → "\x0b"]
set mvcc:TxnActive(10) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\n" → ""]
set mvcc:TxnWrite(10, sql:Row(index, 14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0e\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 14), 10) → 14,NULL,NULL,NULL,'' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x08\x05\x02\x1c\x00\x00\x00\x04\x00"]
set mvcc:TxnWrite(10, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 10) → 4,5,6,7,8,9,10,11,12,13,14 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x17\x0b\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a\x02\x1c"]
set mvcc:TxnWrite(10, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 10) → 4,5,6,7,8,9,10,11,12,13,14 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x17\x0b\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a\x02\x1c"]
set mvcc:TxnWrite(10, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 10) → 4,5,14 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x07\x03\x02\x08\x02\n\x02\x1c"]
set mvcc:TxnWrite(10, sql:Index(index.string, '')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, ''), 10) → 3,14 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x05\x02\x02\x06\x02\x1c"]
set mvcc:TxnWrite(10, sql:Row(index, 15)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0f\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 15), 10) → 15,NULL,NULL,NULL,'' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0f\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x08\x05\x02\x1e\x00\x00\x00\x04\x00"]
set mvcc:TxnWrite(10, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 10) → 4,5,6,7,8,9,10,11,12,13,14,15 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x19\x0c\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a\x02\x1c\x02\x1e"]
set mvcc:TxnWrite(10, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 10) → 4,5,6,7,8,9,10,11,12,13,14,15 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x19\x0c\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a\x02\x1c\x02\x1e"]
set mvcc:TxnWrite(10, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 10) → 4,5,14,15 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\t\x04\x02\x08\x02\n\x02\x1c\x02\x1e"]
set mvcc:TxnWrite(10, sql:Index(index.string, '')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, ''), 10) → 3,14,15 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x07\x03\x02\x06\x02\x1c\x02\x1e"]
delete mvcc:TxnWrite(10, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(index.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(index.string, '')) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Row(index, 14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0e\x00\x00"]
delete mvcc:TxnWrite(10, sql:Row(index, 15)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0f\x00\x00"]
delete mvcc:TxnActive(10) ["\x01\x00\x00\x00\x00\x00\x00\x00\n"]

# Strings that differ only in case are not considered equal, and get separate index entries.
[ops]> INSERT INTO "index" (id, "string") VALUES (16, 'case'), (17, 'CaSe')
---
set mvcc:NextVersion → 12 ["\x00" → "\x0c"]
set mvcc:TxnActive(11) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0b" → ""]
set mvcc:TxnWrite(11, sql:Row(index, 16)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x10\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 16), 11) → 16,NULL,NULL,NULL,'case' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x0c\x05\x02 \x00\x00\x00\x04\x04case"]
set mvcc:TxnWrite(11, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 11) → 4,5,6,7,8,9,10,11,12,13,14,15,16 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x1b\r\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a\x02\x1c\x02\x1e\x02 "]
set mvcc:TxnWrite(11, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 11) → 4,5,6,7,8,9,10,11,12,13,14,15,16 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x1b\r\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a\x02\x1c\x02\x1e\x02 "]
set mvcc:TxnWrite(11, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 11) → 4,5,14,15,16 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x0b\x05\x02\x08\x02\n\x02\x1c\x02\x1e\x02 "]
set mvcc:TxnWrite(11, sql:Index(index.string, 'case')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'case'), 11) → 16 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x03\x01\x02 "]
set mvcc:TxnWrite(11, sql:Row(index, 17)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x11\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 17), 11) → 17,NULL,NULL,NULL,'CaSe' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x0c\x05\x02\"\x00\x00\x00\x04\x04CaSe"]
set mvcc:TxnWrite(11, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 11) → 4,5,6,7,8,9,10,11,12,13,14,15,16,17 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x1d\x0e\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a\x02\x1c\x02\x1e\x02 \x02\""]
set mvcc:TxnWrite(11, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 11) → 4,5,6,7,8,9,10,11,12,13,14,15,16,17 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x1d\x0e\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x12\x02\x14\x02\x16\x02\x18\x02\x1a\x02\x1c\x02\x1e\x02 \x02\""]
set mvcc:TxnWrite(11, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 11) → 4,5,14,15,16,17 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\r\x06\x02\x08\x02\n\x02\x1c\x02\x1e\x02 \x02\""]
set mvcc:TxnWrite(11, sql:Index(index.string, 'CaSe')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'CaSe'), 11) → 17 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x03\x01\x02\""]
delete mvcc:TxnWrite(11, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(index.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(index.string, 'CaSe')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(index.string, 'case')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Row(index, 16)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x10\x00\x00"]
delete mvcc:TxnWrite(11, sql:Row(index, 17)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x11\x00\x00"]
delete mvcc:TxnActive(11) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0b"]


================================================
FILE: src/sql/testscripts/writes/insert_null
================================================
# Tests nullability handling of INSERT.

# Create a table with NULL constraints.
> CREATE TABLE name ( \
    id INT PRIMARY KEY, \
    "null" STRING NULL, \
    not_null STRING NOT NULL \
)
---
ok

# INSERT with NULL works.
> INSERT INTO name VALUES (1, NULL, 'foo')
---
ok

> SELECT * FROM name
---
1, NULL, 'foo'

# INSERT with NULL into non-NULL columns errors.
!> INSERT INTO name VALUES (NULL, 'foo', 'bar')
!> INSERT INTO name VALUES (2, 'foo', NULL)
---
Error: invalid input: invalid primary key NULL
Error: invalid input: NULL value not allowed for column not_null

# Omitting a NULLable column works.
> INSERT INTO name (id, not_null) VALUES (2, 'foo')
---
ok

> SELECT * FROM name
---
1, NULL, 'foo'
2, NULL, 'foo'


================================================
FILE: src/sql/testscripts/writes/insert_primary_key
================================================
# Tests INSERT primary key handling.

# Boolean.
> CREATE TABLE "bool" (id BOOLEAN PRIMARY KEY)
> INSERT INTO "bool" VALUES (true)
> INSERT INTO "bool" VALUES (false)
> SELECT * FROM "bool";
---
FALSE
TRUE

!> INSERT INTO "bool" VALUES (true)
!> INSERT INTO "bool" VALUES (false)
!> INSERT INTO "bool" VALUES (NULL)
---
Error: invalid input: primary key TRUE already exists
Error: invalid input: primary key FALSE already exists
Error: invalid input: invalid primary key NULL

# Integer.
> CREATE TABLE "int" (id INT PRIMARY KEY)
> INSERT INTO "int" VALUES (1)
> INSERT INTO "int" VALUES (0)
> INSERT INTO "int" VALUES (-1)
> INSERT INTO "int" VALUES (9223372036854775807)
> INSERT INTO "int" VALUES (-9223372036854775807)
> SELECT * FROM "int";
---
-9223372036854775807
-1
0
1
9223372036854775807

!> INSERT INTO "int" VALUES (1)
!> INSERT INTO "int" VALUES (0)
!> INSERT INTO "int" VALUES (-1)
!> INSERT INTO "int" VALUES (9223372036854775807)
!> INSERT INTO "int" VALUES (-9223372036854775807)
!> INSERT INTO "int" VALUES (NULL)
---
Error: invalid input: primary key 1 already exists
Error: invalid input: primary key 0 already exists
Error: invalid input: primary key -1 already exists
Error: invalid input: primary key 9223372036854775807 already exists
Error: invalid input: primary key -9223372036854775807 already exists
Error: invalid input: invalid primary key NULL

# Float. -0.0 is normalized as 0.0.
> CREATE TABLE "float" (id FLOAT PRIMARY KEY)
> INSERT INTO "float" VALUES (3.14)
> INSERT INTO "float" VALUES (-3.14)
> INSERT INTO "float" VALUES (-0.0)
> INSERT INTO "float" VALUES (1.23456789012345e308)
> INSERT INTO "float" VALUES (-1.23456789012345e308)
> INSERT INTO "float" VALUES (INFINITY)
> INSERT INTO "float" VALUES (-INFINITY)
> SELECT * FROM "float";
---
-inf
-1.23456789012345e308
-3.14
0.0
3.14
1.23456789012345e308
inf

!> INSERT INTO "float" VALUES (3.14)
!> INSERT INTO "float" VALUES (-3.14)
!> INSERT INTO "float" VALUES (0.0)
!> INSERT INTO "float" VALUES (-0.0)
!> INSERT INTO "float" VALUES (1.23456789012345e308)
!> INSERT INTO "float" VALUES (-1.23456789012345e308)
!> INSERT INTO "float" VALUES (INFINITY)
!> INSERT INTO "float" VALUES (-INFINITY)
!> INSERT INTO "float" VALUES (NAN)
!> INSERT INTO "float" VALUES (NULL)
---
Error: invalid input: primary key 3.14 already exists
Error: invalid input: primary key -3.14 already exists
Error: invalid input: primary key 0.0 already exists
Error: invalid input: primary key -0.0 already exists
Error: invalid input: primary key 1.23456789012345e308 already exists
Error: invalid input: primary key -1.23456789012345e308 already exists
Error: invalid input: primary key inf already exists
Error: invalid input: primary key -inf already exists
Error: invalid input: invalid primary key NaN
Error: invalid input: invalid primary key NULL

# String.
> CREATE TABLE "string" (id STRING PRIMARY KEY)
> INSERT INTO "string" VALUES ('')
> INSERT INTO "string" VALUES ('  ')
> INSERT INTO "string" VALUES ('abc')
> INSERT INTO "string" VALUES ('ABC')
> INSERT INTO "string" VALUES ('Hi! 👋')
> SELECT * FROM "string";
---
''
'  '
'ABC'
'Hi! 👋'
'abc'

!> INSERT INTO "string" VALUES ('')
!> INSERT INTO "string" VALUES ('  ')
!> INSERT INTO "string" VALUES ('abc')
!> INSERT INTO "string" VALUES ('ABC')
!> INSERT INTO "string" VALUES ('Hi! 👋')
!> INSERT INTO "string" VALUES (NULL)
---
Error: invalid input: primary key '' already exists
Error: invalid input: primary key '  ' already exists
Error: invalid input: primary key 'abc' already exists
Error: invalid input: primary key 'ABC' already exists
Error: invalid input: primary key 'Hi! 👋' already exists
Error: invalid input: invalid primary key NULL


================================================
FILE: src/sql/testscripts/writes/insert_reference
================================================
# Tests INSERT foreign key references.

# Create reference tables for all datatypes.
> CREATE TABLE "bool" (id BOOL PRIMARY KEY)
> INSERT INTO "bool" VALUES (true)

> CREATE TABLE "int" (id INT PRIMARY KEY)
> INSERT INTO "int" VALUES (-1), (0), (1)

> CREATE TABLE "float" (id FLOAT PRIMARY KEY)
> INSERT INTO "float" VALUES (3.14), (0.0), (INFINITY)

> CREATE TABLE "string" (id STRING PRIMARY KEY)
> INSERT INTO "string" VALUES (''), ('foo')

> CREATE TABLE name ( \
    id INT PRIMARY KEY, \
    "bool" BOOL REFERENCES "bool", \
    "int" INT REFERENCES "int", \
    "float" FLOAT REFERENCES "float", \
    "string" STRING REFERENCES "string" \
)
---
ok

# INSERTs with existing references work, and update the index entries.
[ops]> INSERT INTO name VALUES (1, true, 1, 3.14, 'foo')
---
set mvcc:NextVersion → 11 ["\x00" → "\x0b"]
set mvcc:TxnActive(10) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\n" → ""]
set mvcc:TxnWrite(10, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 10) → 1,TRUE,1,3.14,'foo' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x15\x05\x02\x02\x01\x01\x02\x02\x03\x1f\x85\xebQ\xb8\x1e\t@\x04\x03foo"]
set mvcc:TxnWrite(10, sql:Index(name.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(name.bool, TRUE), 10) → 1 ["\x04\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(10, sql:Index(name.int, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(name.int, 1), 10) → 1 ["\x04\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(10, sql:Index(name.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(name.float, 3.14), 10) → 1 ["\x04\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(10, sql:Index(name.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.string, 'foo'), 10) → 1 ["\x04\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x03\x01\x02\x02"]
delete mvcc:TxnWrite(10, sql:Index(name.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(name.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(name.int, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(name.string, 'foo')) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(10) ["\x01\x00\x00\x00\x00\x00\x00\x00\n"]

# INSERTs error on missing references.
!> INSERT INTO name (id, "bool") VALUES (2, FALSE)
!> INSERT INTO name (id, "int") VALUES (2, 7)
!> INSERT INTO name (id, "float") VALUES (2, 2.718)
!> INSERT INTO name (id, "string") VALUES (2, 'bar')
---
Error: invalid input: reference FALSE not in table bool
Error: invalid input: reference 7 not in table int
Error: invalid input: reference 2.718 not in table float
Error: invalid input: reference 'bar' not in table string

# -0.0 is equivalent to 0.0.
[ops]> INSERT INTO name (id, "float") VALUES (2, -0.0)
---
set mvcc:NextVersion → 16 ["\x00" → "\x10"]
set mvcc:TxnActive(15) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0f" → ""]
set mvcc:TxnWrite(15, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 15) → 2,NULL,NULL,0.0,NULL ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x01\x0f\x05\x02\x04\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"]
set mvcc:TxnWrite(15, sql:Index(name.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.bool, NULL), 15) → 2 ["\x04\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(15, sql:Index(name.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.int, NULL), 15) → 2 ["\x04\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(15, sql:Index(name.float, 0.0)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.float, 0.0), 15) → 2 ["\x04\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(15, sql:Index(name.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.string, NULL), 15) → 2 ["\x04\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x01\x03\x01\x02\x04"]
delete mvcc:TxnWrite(15, sql:Index(name.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(15, sql:Index(name.float, 0.0)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(15, sql:Index(name.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(15, sql:Index(name.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(15, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(15) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0f"]

# NaN is not a valid reference, and unlike NULL it can't be used to mark a missing one.
!> INSERT INTO name (id, "float") VALUES (3, NAN)
---
Error: invalid input: reference NaN not in table float

# INFINITY is a valid reference.
> INSERT INTO name (id, "float") VALUES (3, INFINITY)
---
ok

# References are case sensitive.
!> INSERT INTO name (id, "string") VALUES (4, 'FOO')
---
Error: invalid input: reference 'FOO' not in table string

# Empty strings are valid references.
> INSERT INTO name (id, "string") VALUES (5, '')
---
ok

# NULLs are valid.
> INSERT INTO name (id) VALUES (6)
---
ok

> SELECT * FROM name
---
1, TRUE, 1, 3.14, 'foo'
2, NULL, NULL, 0.0, NULL
3, NULL, NULL, inf, NULL
5, NULL, NULL, NULL, ''
6, NULL, NULL, NULL, NULL

# Self references are fine.
> CREATE TABLE self (id INT PRIMARY KEY, self_id INT REFERENCES self)
---
ok

[ops]> INSERT INTO self VALUES (1, 1)
---
set mvcc:NextVersion → 23 ["\x00" → "\x17"]
set mvcc:TxnActive(22) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x16" → ""]
set mvcc:TxnWrite(22, sql:Row(self, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(self, 1), 22) → 1,1 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x16" → "\x01\x05\x02\x02\x02\x02\x02"]
set mvcc:TxnWrite(22, sql:Index(self.self_id, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, 1), 22) → 1 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x16" → "\x01\x03\x01\x02\x02"]
delete mvcc:TxnWrite(22, sql:Index(self.self_id, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(22, sql:Row(self, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(22) ["\x01\x00\x00\x00\x00\x00\x00\x00\x16"]

[ops]> INSERT INTO self VALUES (2, 1)
---
set mvcc:NextVersion → 24 ["\x00" → "\x18"]
set mvcc:TxnActive(23) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x17" → ""]
set mvcc:TxnWrite(23, sql:Row(self, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(self, 2), 23) → 2,1 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17" → "\x01\x05\x02\x02\x04\x02\x02"]
set mvcc:TxnWrite(23, sql:Index(self.self_id, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, 1), 23) → 1,2 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17" → "\x01\x05\x02\x02\x02\x02\x04"]
delete mvcc:TxnWrite(23, sql:Index(self.self_id, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(23, sql:Row(self, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(23) ["\x01\x00\x00\x00\x00\x00\x00\x00\x17"]

[ops]> INSERT INTO self VALUES (3, NULL)
---
set mvcc:NextVersion → 25 ["\x00" → "\x19"]
set mvcc:TxnActive(24) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x18" → ""]
set mvcc:TxnWrite(24, sql:Row(self, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(self, 3), 24) → 3,NULL ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18" → "\x01\x04\x02\x02\x06\x00"]
set mvcc:TxnWrite(24, sql:Index(self.self_id, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, NULL), 24) → 3 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18" → "\x01\x03\x01\x02\x06"]
delete mvcc:TxnWrite(24, sql:Index(self.self_id, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(24, sql:Row(self, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnActive(24) ["\x01\x00\x00\x00\x00\x00\x00\x00\x18"]

!> INSERT INTO self VALUES (4, 9)
---
Error: invalid input: reference 9 not in table self


================================================
FILE: src/sql/testscripts/writes/insert_unique
================================================
# Tests INSERT with UNIQUE columns, including their index writes.

> CREATE TABLE "unique" ( \
    id INT PRIMARY KEY, \
    "bool" BOOL UNIQUE, \
    "int" INT UNIQUE, \
    "float" FLOAT UNIQUE, \
    "string" STRING UNIQUE \
)
---
ok

# An INSERT writes to all indexes.
[ops]> INSERT INTO "unique" VALUES (1, TRUE, 7, 3.14, 'foo')
---
set mvcc:NextVersion → 3 ["\x00" → "\x03"]
set mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
set mvcc:TxnWrite(2, sql:Row(unique, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 1), 2) → 1,TRUE,7,3.14,'foo' ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x15\x05\x02\x02\x01\x01\x02\x0e\x03\x1f\x85\xebQ\xb8\x1e\t@\x04\x03foo"]
set mvcc:TxnWrite(2, sql:Index(unique.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, TRUE), 2) → 1 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(2, sql:Index(unique.int, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, 7), 2) → 1 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(2, sql:Index(unique.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, 3.14), 2) → 1 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(2, sql:Index(unique.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'foo'), 2) → 1 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x03\x01\x02\x02"]
delete mvcc:TxnWrite(2, sql:Index(unique.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(2, sql:Index(unique.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(2, sql:Index(unique.int, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnWrite(2, sql:Index(unique.string, 'foo')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(2, sql:Row(unique, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(2) ["\x01\x00\x00\x00\x00\x00\x00\x00\x02"]

# Another insert with the same values errors for all indexes.
!> INSERT INTO "unique" (id, "bool") VALUES (2, TRUE)
!> INSERT INTO "unique" (id, "int") VALUES (2, 7)
!> INSERT INTO "unique" (id, "float") VALUES (2, 3.14)
!> INSERT INTO "unique" (id, "string") VALUES (2, 'foo')
---
Error: invalid input: value TRUE already in unique column bool
Error: invalid input: value 7 already in unique column int
Error: invalid input: value 3.14 already in unique column float
Error: invalid input: value 'foo' already in unique column string

# An insert with different values writes new index entries.
[ops]> INSERT INTO "unique" VALUES (3, FALSE, 0, 2.718, 'bar')
---
set mvcc:NextVersion → 8 ["\x00" → "\x08"]
set mvcc:TxnActive(7) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x07" → ""]
set mvcc:TxnWrite(7, sql:Row(unique, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 3), 7) → 3,FALSE,0,2.718,'bar' ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x15\x05\x02\x06\x01\x00\x02\x00\x03X9\xb4\xc8v\xbe\x05@\x04\x03bar"]
set mvcc:TxnWrite(7, sql:Index(unique.bool, FALSE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, FALSE), 7) → 3 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(7, sql:Index(unique.int, 0)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, 0), 7) → 3 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(7, sql:Index(unique.float, 2.718)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, 2.718), 7) → 3 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(7, sql:Index(unique.string, 'bar')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'bar'), 7) → 3 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x06"]
delete mvcc:TxnWrite(7, sql:Index(unique.bool, FALSE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(unique.float, 2.718)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(unique.int, 0)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(unique.string, 'bar')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Row(unique, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnActive(7) ["\x01\x00\x00\x00\x00\x00\x00\x00\x07"]

# Inserts with NULLs add NULL entries. Duplicates are allowed.
[ops]> INSERT INTO "unique" VALUES (4)
---
set mvcc:NextVersion → 9 ["\x00" → "\t"]
set mvcc:TxnActive(8) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x08" → ""]
set mvcc:TxnWrite(8, sql:Row(unique, 4)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 4), 8) → 4,NULL,NULL,NULL,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x07\x05\x02\x08\x00\x00\x00\x00"]
set mvcc:TxnWrite(8, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 8) → 4 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x08"]
set mvcc:TxnWrite(8, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 8) → 4 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x08"]
set mvcc:TxnWrite(8, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 8) → 4 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x08"]
set mvcc:TxnWrite(8, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 8) → 4 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x08"]
delete mvcc:TxnWrite(8, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(unique.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(unique, 4)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x04\x00\x00"]
delete mvcc:TxnActive(8) ["\x01\x00\x00\x00\x00\x00\x00\x00\x08"]

[ops]> INSERT INTO "unique" VALUES (5)
---
set mvcc:NextVersion → 10 ["\x00" → "\n"]
set mvcc:TxnActive(9) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\t" → ""]
set mvcc:TxnWrite(9, sql:Row(unique, 5)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x05\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 5), 9) → 5,NULL,NULL,NULL,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x07\x05\x02\n\x00\x00\x00\x00"]
set mvcc:TxnWrite(9, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 9) → 4,5 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x05\x02\x02\x08\x02\n"]
set mvcc:TxnWrite(9, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 9) → 4,5 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x05\x02\x02\x08\x02\n"]
set mvcc:TxnWrite(9, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 9) → 4,5 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x05\x02\x02\x08\x02\n"]
set mvcc:TxnWrite(9, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 9) → 4,5 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\t" → "\x01\x05\x02\x02\x08\x02\n"]
delete mvcc:TxnWrite(9, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(unique.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(9, sql:Row(unique, 5)) ["\x03\x00\x00\x00\x00\x00\x00\x00\t\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x05\x00\x00"]
delete mvcc:TxnActive(9) ["\x01\x00\x00\x00\x00\x00\x00\x00\t"]

# Float NaNs are never considered equal to each other, so duplicates are allowed.
[ops]> INSERT INTO "unique" (id, "float") VALUES (6, NAN)
---
set mvcc:NextVersion → 11 ["\x00" → "\x0b"]
set mvcc:TxnActive(10) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\n" → ""]
set mvcc:TxnWrite(10, sql:Row(unique, 6)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x06\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 6), 10) → 6,NULL,NULL,NaN,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x06\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x0f\x05\x02\x0c\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf8\x7f\x00"]
set mvcc:TxnWrite(10, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 10) → 4,5,6 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x07\x03\x02\x08\x02\n\x02\x0c"]
set mvcc:TxnWrite(10, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 10) → 4,5,6 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x07\x03\x02\x08\x02\n\x02\x0c"]
set mvcc:TxnWrite(10, sql:Index(unique.float, NaN)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NaN), 10) → 6 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x03\x01\x02\x0c"]
set mvcc:TxnWrite(10, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 10) → 4,5,6 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\n" → "\x01\x07\x03\x02\x08\x02\n\x02\x0c"]
delete mvcc:TxnWrite(10, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(unique.float, NaN)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(10, sql:Row(unique, 6)) ["\x03\x00\x00\x00\x00\x00\x00\x00\n\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x06\x00\x00"]
delete mvcc:TxnActive(10) ["\x01\x00\x00\x00\x00\x00\x00\x00\n"]

[ops]> INSERT INTO "unique" (id, "float") VALUES (7, NAN)
---
set mvcc:NextVersion → 12 ["\x00" → "\x0c"]
set mvcc:TxnActive(11) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0b" → ""]
set mvcc:TxnWrite(11, sql:Row(unique, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 7), 11) → 7,NULL,NULL,NaN,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x0f\x05\x02\x0e\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf8\x7f\x00"]
set mvcc:TxnWrite(11, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 11) → 4,5,6,7 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\t\x04\x02\x08\x02\n\x02\x0c\x02\x0e"]
set mvcc:TxnWrite(11, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 11) → 4,5,6,7 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\t\x04\x02\x08\x02\n\x02\x0c\x02\x0e"]
set mvcc:TxnWrite(11, sql:Index(unique.float, NaN)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NaN), 11) → 6,7 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x05\x02\x02\x0c\x02\x0e"]
set mvcc:TxnWrite(11, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 11) → 4,5,6,7 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\t\x04\x02\x08\x02\n\x02\x0c\x02\x0e"]
delete mvcc:TxnWrite(11, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(unique.float, NaN)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Row(unique, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnActive(11) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0b"]

# Float 0.0 and -0.0 are considered equal.
[ops]> INSERT INTO "unique" (id, "float") VALUES (8, -0.0)
---
set mvcc:NextVersion → 13 ["\x00" → "\r"]
set mvcc:TxnActive(12) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0c" → ""]
set mvcc:TxnWrite(12, sql:Row(unique, 8)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 8), 12) → 8,NULL,NULL,0.0,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c" → "\x01\x0f\x05\x02\x10\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"]
set mvcc:TxnWrite(12, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 12) → 4,5,6,7,8 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c" → "\x01\x0b\x05\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10"]
set mvcc:TxnWrite(12, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 12) → 4,5,6,7,8 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c" → "\x01\x0b\x05\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10"]
set mvcc:TxnWrite(12, sql:Index(unique.float, 0.0)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, 0.0), 12) → 8 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c" → "\x01\x03\x01\x02\x10"]
set mvcc:TxnWrite(12, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 12) → 4,5,6,7,8 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c" → "\x01\x0b\x05\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10"]
delete mvcc:TxnWrite(12, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(12, sql:Index(unique.float, 0.0)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(12, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(12, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(12, sql:Row(unique, 8)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0c\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x08\x00\x00"]
delete mvcc:TxnActive(12) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0c"]

!> INSERT INTO "unique" (id, "float") VALUES (9, 0.0)
---
Error: invalid input: value 0.0 already in unique column float

# Float INFINITY is also subject to the unique constraint.
[ops]> INSERT INTO "unique" (id, "float") VALUES (10, INFINITY)
---
set mvcc:NextVersion → 15 ["\x00" → "\x0f"]
set mvcc:TxnActive(14) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0e" → ""]
set mvcc:TxnWrite(14, sql:Row(unique, 10)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\n\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 10), 14) → 10,NULL,NULL,inf,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x0f\x05\x02\x14\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\x7f\x00"]
set mvcc:TxnWrite(14, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 14) → 4,5,6,7,8,10 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\r\x06\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14"]
set mvcc:TxnWrite(14, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 14) → 4,5,6,7,8,10 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\r\x06\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14"]
set mvcc:TxnWrite(14, sql:Index(unique.float, inf)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, inf), 14) → 10 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x03\x01\x02\x14"]
set mvcc:TxnWrite(14, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 14) → 4,5,6,7,8,10 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\r\x06\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14"]
delete mvcc:TxnWrite(14, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(14, sql:Index(unique.float, inf)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(14, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(14, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(14, sql:Row(unique, 10)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\n\x00\x00"]
delete mvcc:TxnActive(14) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0e"]

!> INSERT INTO "unique" (id, "float") VALUES (11, INFINITY)
---
Error: invalid input: value inf already in unique column float

# Empty strings are considered equal.
[ops]> INSERT INTO "unique" (id, "string") VALUES (11, '')
---
set mvcc:NextVersion → 17 ["\x00" → "\x11"]
set mvcc:TxnActive(16) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x10" → ""]
set mvcc:TxnWrite(16, sql:Row(unique, 11)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0b\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 11), 16) → 11,NULL,NULL,NULL,'' ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10" → "\x01\x08\x05\x02\x16\x00\x00\x00\x04\x00"]
set mvcc:TxnWrite(16, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 16) → 4,5,6,7,8,10,11 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10" → "\x01\x0f\x07\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14\x02\x16"]
set mvcc:TxnWrite(16, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 16) → 4,5,6,7,8,10,11 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10" → "\x01\x0f\x07\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14\x02\x16"]
set mvcc:TxnWrite(16, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 16) → 4,5,11 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10" → "\x01\x07\x03\x02\x08\x02\n\x02\x16"]
set mvcc:TxnWrite(16, sql:Index(unique.string, '')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, ''), 16) → 11 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x10" → "\x01\x03\x01\x02\x16"]
delete mvcc:TxnWrite(16, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(16, sql:Index(unique.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(16, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(16, sql:Index(unique.string, '')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(16, sql:Row(unique, 11)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x10\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0b\x00\x00"]
delete mvcc:TxnActive(16) ["\x01\x00\x00\x00\x00\x00\x00\x00\x10"]

!> INSERT INTO "unique" (id, "string") VALUES (12, '')
---
Error: invalid input: value '' already in unique column string

# Strings that differ only in case are not considered equal.
[ops]> INSERT INTO "unique" (id, "string") VALUES (12, 'case')
---
set mvcc:NextVersion → 19 ["\x00" → "\x13"]
set mvcc:TxnActive(18) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x12" → ""]
set mvcc:TxnWrite(18, sql:Row(unique, 12)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0c\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 12), 18) → 12,NULL,NULL,NULL,'case' ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\x0c\x05\x02\x18\x00\x00\x00\x04\x04case"]
set mvcc:TxnWrite(18, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 18) → 4,5,6,7,8,10,11,12 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\x11\x08\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14\x02\x16\x02\x18"]
set mvcc:TxnWrite(18, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 18) → 4,5,6,7,8,10,11,12 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\x11\x08\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14\x02\x16\x02\x18"]
set mvcc:TxnWrite(18, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 18) → 4,5,11,12 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\t\x04\x02\x08\x02\n\x02\x16\x02\x18"]
set mvcc:TxnWrite(18, sql:Index(unique.string, 'case')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'case'), 18) → 12 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\x03\x01\x02\x18"]
delete mvcc:TxnWrite(18, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(18, sql:Index(unique.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(18, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(18, sql:Index(unique.string, 'case')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(18, sql:Row(unique, 12)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x0c\x00\x00"]
delete mvcc:TxnActive(18) ["\x01\x00\x00\x00\x00\x00\x00\x00\x12"]

[ops]> INSERT INTO "unique" (id, "string") VALUES (13, 'CaSe')
---
set mvcc:NextVersion → 20 ["\x00" → "\x14"]
set mvcc:TxnActive(19) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x13" → ""]
set mvcc:TxnWrite(19, sql:Row(unique, 13)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\r\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 13), 19) → 13,NULL,NULL,NULL,'CaSe' ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\r\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13" → "\x01\x0c\x05\x02\x1a\x00\x00\x00\x04\x04CaSe"]
set mvcc:TxnWrite(19, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 19) → 4,5,6,7,8,10,11,12,13 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13" → "\x01\x13\t\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14\x02\x16\x02\x18\x02\x1a"]
set mvcc:TxnWrite(19, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 19) → 4,5,6,7,8,10,11,12,13 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13" → "\x01\x13\t\x02\x08\x02\n\x02\x0c\x02\x0e\x02\x10\x02\x14\x02\x16\x02\x18\x02\x1a"]
set mvcc:TxnWrite(19, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 19) → 4,5,11,12,13 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13" → "\x01\x0b\x05\x02\x08\x02\n\x02\x16\x02\x18\x02\x1a"]
set mvcc:TxnWrite(19, sql:Index(unique.string, 'CaSe')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'CaSe'), 19) → 13 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x13" → "\x01\x03\x01\x02\x1a"]
delete mvcc:TxnWrite(19, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(19, sql:Index(unique.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(19, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(19, sql:Index(unique.string, 'CaSe')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(19, sql:Row(unique, 13)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x13\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\r\x00\x00"]
delete mvcc:TxnActive(19) ["\x01\x00\x00\x00\x00\x00\x00\x00\x13"]


================================================
FILE: src/sql/testscripts/writes/update
================================================
# Tests basic UPDATE functionality.

> CREATE TABLE name (id INT PRIMARY KEY, value STRING)
> INSERT INTO name VALUES (1, 'a'), (2, 'b')
---
ok

# UPDATE updates rows, and returns the number of rows.
[plan,result,ops]> UPDATE name SET value = 'foo'
---
Update: name (value='foo')
└─ Scan: name
set mvcc:NextVersion → 4 ["\x00" → "\x04"]
set mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
set mvcc:TxnWrite(3, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 3) → 1,'foo' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x08\x02\x02\x02\x04\x03foo"]
set mvcc:TxnWrite(3, sql:Row(name, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 2), 3) → 2,'foo' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x08\x02\x02\x04\x04\x03foo"]
delete mvcc:TxnWrite(3, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(3, sql:Row(name, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(3) ["\x01\x00\x00\x00\x00\x00\x00\x00\x03"]
Update { count: 2 }

> SELECT * FROM name
---
1, 'foo'
2, 'foo'

dump
---
mvcc:NextVersion → 4 ["\x00" → "\x04"]
mvcc:Version(sql:Table(name), 1) → CREATE TABLE name ( id INTEGER PRIMARY KEY, value STRING DEFAULT NULL ) ["\x04\x00\xffname\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x1d\x04name\x00\x02\x02id\x01\x00\x00\x01\x00\x00\x05value\x03\x01\x01\x00\x00\x00\x00"]
mvcc:Version(sql:Row(name, 1), 2) → 1,'a' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x02\x04\x01a"]
mvcc:Version(sql:Row(name, 1), 3) → 1,'foo' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x08\x02\x02\x02\x04\x03foo"]
mvcc:Version(sql:Row(name, 2), 2) → 2,'b' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x06\x02\x02\x04\x04\x01b"]
mvcc:Version(sql:Row(name, 2), 3) → 2,'foo' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x08\x02\x02\x04\x04\x03foo"]

# Bare UPDATE errors.
!> UPDATE
!> UPDATE name
!> UPDATE name SET
!> UPDATE name SET value
---
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input
Error: invalid input: unexpected end of input

# Unknown table or column errors.
!> UPDATE foo SET value = 'bar'
!> UPDATE name SET foo = 'bar'
---
Error: invalid input: table foo does not exist
Error: invalid input: unknown column foo

# Specifying the same column multiple times errors.
!> UPDATE name SET value = 'e', value = 'f'
---
Error: invalid input: column value set multiple times


================================================
FILE: src/sql/testscripts/writes/update_datatypes
================================================
# Tests UPDATE of all datatypes.

# Create columns and a row with all datatypes.
> CREATE TABLE datatypes ( \
    id INTEGER PRIMARY KEY, \
    "bool" BOOL, \
    "int" INT, \
    "float" FLOAT, \
    "string" STRING \
)
> INSERT INTO datatypes VALUES (1)
---
ok

# Booleans.
> UPDATE datatypes SET "bool" = NULL
> UPDATE datatypes SET "bool" = FALSE
> UPDATE datatypes SET "bool" = TRUE
> SELECT * FROM datatypes
---
1, TRUE, NULL, NULL, NULL

!> UPDATE datatypes SET "bool" = 1
!> UPDATE datatypes SET "bool" = 3.14
!> UPDATE datatypes SET "bool" = 'false'
---
Error: invalid input: invalid datatype INTEGER for BOOLEAN column bool
Error: invalid input: invalid datatype FLOAT for BOOLEAN column bool
Error: invalid input: invalid datatype STRING for BOOLEAN column bool

# Integers.
> UPDATE datatypes SET "int" = NULL
> UPDATE datatypes SET "int" = 1
> UPDATE datatypes SET "int" = 0
> UPDATE datatypes SET "int" = -1
> UPDATE datatypes SET "int" = 9223372036854775807
> UPDATE datatypes SET "int" = -9223372036854775807
> SELECT * FROM datatypes
---
1, TRUE, -9223372036854775807, NULL, NULL

!> UPDATE datatypes SET "int" = false
!> UPDATE datatypes SET "int" = 3.0
!> UPDATE datatypes SET "int" = '0'
---
Error: invalid input: invalid datatype BOOLEAN for INTEGER column int
Error: invalid input: invalid datatype FLOAT for INTEGER column int
Error: invalid input: invalid datatype STRING for INTEGER column int

# Floats.
> UPDATE datatypes SET "float" = NULL
> UPDATE datatypes SET "float" = 3.14
> UPDATE datatypes SET "float" = -3.14
> UPDATE datatypes SET "float" = 1.23456789012345e308
> UPDATE datatypes SET "float" = -1.23456789012345e308
> UPDATE datatypes SET "float" = INFINITY
> UPDATE datatypes SET "float" = -INFINITY
> SELECT * FROM datatypes
---
1, TRUE, -9223372036854775807, -inf, NULL

> UPDATE datatypes SET "float" = NAN
> SELECT "float" FROM datatypes
> UPDATE datatypes SET "float" = -NAN
> SELECT "float" FROM datatypes
> UPDATE datatypes SET "float" = 0.0
> SELECT "float" FROM datatypes
> UPDATE datatypes SET "float" = -0.0
> SELECT "float" FROM datatypes
---
NaN
NaN
0.0
0.0

# Strings.
> UPDATE datatypes SET "string" = NULL
> UPDATE datatypes SET "string" = ''
> UPDATE datatypes SET "string" = '  '
> UPDATE datatypes SET "string" = 'abc'
> UPDATE datatypes SET "string" = 'Hi! 👋'
> SELECT * FROM datatypes
---
1, TRUE, -9223372036854775807, 0.0, 'Hi! 👋'

!> UPDATE datatypes SET "string" = false
!> UPDATE datatypes SET "string" = 3
!> UPDATE datatypes SET "string" = 3.14
---
Error: invalid input: invalid datatype BOOLEAN for STRING column string
Error: invalid input: invalid datatype INTEGER for STRING column string
Error: invalid input: invalid datatype FLOAT for STRING column string


================================================
FILE: src/sql/testscripts/writes/update_default
================================================
# UPDATE can set default values.

> CREATE TABLE defaults ( \
    id INTEGER PRIMARY KEY, \
    required BOOLEAN NOT NULL, \
    "null" BOOLEAN, \
    "boolean" BOOLEAN DEFAULT TRUE, \
    "float" FLOAT DEFAULT 3.14, \
    "integer" INTEGER DEFAULT 7, \
    "string" STRING DEFAULT 'foo' \
)
> INSERT INTO defaults VALUES (1, true, NULL, NULL, NULL, NULL, NULL)
---
ok

> UPDATE defaults SET "null" = DEFAULT, "boolean" = DEFAULT, "float" = DEFAULT, "integer" = DEFAULT, "string" = DEFAULT
> SELECT * FROM defaults
---
1, TRUE, NULL, TRUE, 3.14, 7, 'foo'

# Errors on columns with no default.
!> UPDATE defaults SET required = DEFAULT
---
Error: invalid input: column required has no default value


================================================
FILE: src/sql/testscripts/writes/update_expression
================================================
# Tests UPDATE expression evaluation.

> CREATE TABLE test (id INT PRIMARY KEY, value INT, quantity INT NOT NULL)
> INSERT INTO test VALUES (0, NULL, 0), (1, 1, 0), (2, 2, 0)
---
ok

# UPDATE can evaluate constant expressions.
> UPDATE test SET value = 2 * 2 + 3
> SELECT * FROM test
---
0, 7, 0
1, 7, 0
2, 7, 0

# UPDATE can evaluate variable expressions.
> UPDATE test SET value = id + 10 - quantity
> SELECT * FROM test
---
0, 10, 0
1, 11, 0
2, 12, 0

# UPDATE evaluation uses the old values.
> UPDATE test SET value = id + 1, quantity = value
> SELECT * FROM test
---
0, 1, 10
1, 2, 11
2, 3, 12

# This is also true with primary key updates.
> UPDATE test SET id = id - 1, value = id, quantity = value
> SELECT * FROM test
---
-1, 0, 1
0, 1, 2
1, 2, 3

# UPDATE expressions respect constraints.
> UPDATE test SET value = NULL WHERE id = 0
> SELECT * FROM test
!> UPDATE test SET quantity = value
---
-1, 0, 1
0, NULL, 2
1, 2, 3
Error: invalid input: NULL value not allowed for column quantity


================================================
FILE: src/sql/testscripts/writes/update_index
================================================
# Tests UPDATE index writes.

> CREATE TABLE "index" ( \
    id INT PRIMARY KEY, \
    "bool" BOOL INDEX, \
    "int" INT INDEX, \
    "float" FLOAT INDEX, \
    "string" STRING INDEX \
)
> INSERT INTO "index" VALUES (1, TRUE, 7, 3.14, 'foo')
---
ok

# An UPDATE writes to all indexes.
[ops]> UPDATE "index" SET "bool" = FALSE, "int" = 1, "float" = 2.718, "string" = 'bar'
---
set mvcc:NextVersion → 4 ["\x00" → "\x04"]
set mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
set mvcc:TxnWrite(3, sql:Index(index.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, TRUE), 3) → None ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Index(index.bool, FALSE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, FALSE), 3) → 1 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(3, sql:Index(index.int, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 7), 3) → None ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Index(index.int, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 1), 3) → 1 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(3, sql:Index(index.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 3.14), 3) → None ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Index(index.float, 2.718)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 2.718), 3) → 1 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(3, sql:Index(index.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'foo'), 3) → None ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
set mvcc:TxnWrite(3, sql:Index(index.string, 'bar')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'bar'), 3) → 1 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(3, sql:Row(index, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 1), 3) → 1,FALSE,1,2.718,'bar' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x15\x05\x02\x02\x01\x00\x02\x02\x03X9\xb4\xc8v\xbe\x05@\x04\x03bar"]
delete mvcc:TxnWrite(3, sql:Index(index.bool, FALSE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.float, 2.718)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.int, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.int, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.string, 'bar')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(3, sql:Index(index.string, 'foo')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(3, sql:Row(index, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(3) ["\x01\x00\x00\x00\x00\x00\x00\x00\x03"]

# A single-column update only updates the relevant index.
[ops]> UPDATE "index" SET "bool" = TRUE
---
set mvcc:NextVersion → 5 ["\x00" → "\x05"]
set mvcc:TxnActive(4) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x04" → ""]
set mvcc:TxnWrite(4, sql:Index(index.bool, FALSE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, FALSE), 4) → None ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x00"]
set mvcc:TxnWrite(4, sql:Index(index.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, TRUE), 4) → 1 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(4, sql:Row(index, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 1), 4) → 1,TRUE,1,2.718,'bar' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x15\x05\x02\x02\x01\x01\x02\x02\x03X9\xb4\xc8v\xbe\x05@\x04\x03bar"]
delete mvcc:TxnWrite(4, sql:Index(index.bool, FALSE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(index.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(4, sql:Row(index, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(4) ["\x01\x00\x00\x00\x00\x00\x00\x00\x04"]

# An update with different values writes new index entries.
> INSERT INTO "index" VALUES (2, NULL, NULL, NULL, NULL)
[ops]> UPDATE "index" SET "bool" = FALSE, "int" = 7, "float" = 3.14, "string" = 'abc' WHERE id = 2
---
set mvcc:NextVersion → 7 ["\x00" → "\x07"]
set mvcc:TxnActive(6) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x06" → ""]
set mvcc:TxnWrite(6, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 6) → None ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x00"]
set mvcc:TxnWrite(6, sql:Index(index.bool, FALSE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, FALSE), 6) → 2 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(6, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 6) → None ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x00"]
set mvcc:TxnWrite(6, sql:Index(index.int, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 7), 6) → 2 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(6, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 6) → None ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x00"]
set mvcc:TxnWrite(6, sql:Index(index.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 3.14), 6) → 2 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(6, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 6) → None ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x00"]
set mvcc:TxnWrite(6, sql:Index(index.string, 'abc')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04abc\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'abc'), 6) → 2 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04abc\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(6, sql:Row(index, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 2), 6) → 2,FALSE,7,3.14,'abc' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06" → "\x01\x15\x05\x02\x04\x01\x00\x02\x0e\x03\x1f\x85\xebQ\xb8\x1e\t@\x04\x03abc"]
delete mvcc:TxnWrite(6, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.bool, FALSE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.int, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Index(index.string, 'abc')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04abc\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(6, sql:Row(index, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x06\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(6) ["\x01\x00\x00\x00\x00\x00\x00\x00\x06"]

> SELECT * FROM "index"
---
1, TRUE, 1, 2.718, 'bar'
2, FALSE, 7, 3.14, 'abc'

# Updates with the same values merge the index entries.
[ops]> UPDATE "index" SET "bool" = TRUE, "int" = 7, "float" = 3.14, "string" = 'foo'
---
set mvcc:NextVersion → 8 ["\x00" → "\x08"]
set mvcc:TxnActive(7) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x07" → ""]
set mvcc:TxnWrite(7, sql:Index(index.int, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 1), 7) → None ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x00"]
set mvcc:TxnWrite(7, sql:Index(index.int, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 7), 7) → 1,2 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(7, sql:Index(index.float, 2.718)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 2.718), 7) → None ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x00"]
set mvcc:TxnWrite(7, sql:Index(index.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 3.14), 7) → 1,2 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(7, sql:Index(index.string, 'bar')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'bar'), 7) → None ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x00"]
set mvcc:TxnWrite(7, sql:Index(index.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'foo'), 7) → 1 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(7, sql:Row(index, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 1), 7) → 1,TRUE,7,3.14,'foo' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x15\x05\x02\x02\x01\x01\x02\x0e\x03\x1f\x85\xebQ\xb8\x1e\t@\x04\x03foo"]
set mvcc:TxnWrite(7, sql:Index(index.bool, FALSE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, FALSE), 7) → None ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x00"]
set mvcc:TxnWrite(7, sql:Index(index.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, TRUE), 7) → 1,2 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(7, sql:Index(index.string, 'abc')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04abc\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'abc'), 7) → None ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04abc\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x00"]
set mvcc:TxnWrite(7, sql:Index(index.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'foo'), 7) → 1,2 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(7, sql:Row(index, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 2), 7) → 2,TRUE,7,3.14,'foo' ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07" → "\x01\x15\x05\x02\x04\x01\x01\x02\x0e\x03\x1f\x85\xebQ\xb8\x1e\t@\x04\x03foo"]
delete mvcc:TxnWrite(7, sql:Index(index.bool, FALSE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.float, 2.718)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.int, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.int, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.string, 'abc')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04abc\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.string, 'bar')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04bar\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Index(index.string, 'foo')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(7, sql:Row(index, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(7, sql:Row(index, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x07\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(7) ["\x01\x00\x00\x00\x00\x00\x00\x00\x07"]

> SELECT * FROM "index"
---
1, TRUE, 7, 3.14, 'foo'
2, TRUE, 7, 3.14, 'foo'

# Updates with all NULLs work and get indexed.
[ops]> UPDATE "index" SET "bool" = NULL, "int" = NULL, "float" = NULL, "string" = NULL
---
set mvcc:NextVersion → 9 ["\x00" → "\t"]
set mvcc:TxnActive(8) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x08" → ""]
set mvcc:TxnWrite(8, sql:Index(index.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, TRUE), 8) → 2 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(8, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 8) → 1 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(8, sql:Index(index.int, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 7), 8) → 2 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(8, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 8) → 1 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(8, sql:Index(index.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 3.14), 8) → 2 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(8, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 8) → 1 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(8, sql:Index(index.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'foo'), 8) → 2 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(8, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 8) → 1 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(8, sql:Row(index, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 1), 8) → 1,NULL,NULL,NULL,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x07\x05\x02\x02\x00\x00\x00\x00"]
set mvcc:TxnWrite(8, sql:Index(index.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, TRUE), 8) → None ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(index.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.bool, NULL), 8) → 1,2 ["\x04\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(8, sql:Index(index.int, 7)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, 7), 8) → None ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(index.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.int, NULL), 8) → 1,2 ["\x04\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(8, sql:Index(index.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, 3.14), 8) → None ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(index.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.float, NULL), 8) → 1,2 ["\x04\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(8, sql:Index(index.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, 'foo'), 8) → None ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x00"]
set mvcc:TxnWrite(8, sql:Index(index.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(index.string, NULL), 8) → 1,2 ["\x04\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(8, sql:Row(index, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(index, 2), 8) → 2,NULL,NULL,NULL,NULL ["\x04\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x08" → "\x01\x07\x05\x02\x04\x00\x00\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.int, 7)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x07\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Index(index.string, 'foo')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x01index\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(index, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(8, sql:Row(index, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x08\x02index\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(8) ["\x01\x00\x00\x00\x00\x00\x00\x00\x08"]

> SELECT * FROM "index"
---
1, NULL, NULL, NULL, NULL
2, NULL, NULL, NULL, NULL


================================================
FILE: src/sql/testscripts/writes/update_null
================================================
# Tests nullability handling of UPDATE.

# Create a table with NULL constraints and a row.
> CREATE TABLE name ( \
    id INT PRIMARY KEY, \
    "null" STRING NULL, \
    not_null STRING NOT NULL \
)
> INSERT INTO name VALUES (1, 'foo', 'bar')
---
ok

# UPDATE with NULL works.
> UPDATE name SET "null" = NULL
> SELECT * FROM name
---
1, NULL, 'bar'

# UPDATE with NULL in non-NULL column errors.
!> UPDATE name SET id = NULL
!> UPDATE name SET non_null = NULL
---
Error: invalid input: invalid primary key NULL
Error: invalid input: unknown column non_null


================================================
FILE: src/sql/testscripts/writes/update_primary_key
================================================
# Tests UPDATE primary key handling.

# Boolean.
> CREATE TABLE "bool" (id BOOLEAN PRIMARY KEY)
> INSERT INTO "bool" VALUES (TRUE)
> UPDATE "bool" SET id = FALSE
> SELECT * FROM "bool"
---
FALSE

> INSERT INTO "bool" VALUES (TRUE)
!> UPDATE "bool" SET id = FALSE
!> UPDATE "bool" SET id = FALSE WHERE id = TRUE
---
Error: invalid input: primary key FALSE already exists
Error: invalid input: primary key FALSE already exists

# Integer.
> CREATE TABLE "int" (id INT PRIMARY KEY)
> INSERT INTO "int" VALUES (0)
> UPDATE "int" SET id = 1
> SELECT * FROM "int"
> UPDATE "int" SET id = -1
> SELECT * FROM "int"
> UPDATE "int" SET id = 9223372036854775807
> SELECT * FROM "int"
> UPDATE "int" SET id = -9223372036854775807
> SELECT * FROM "int"
---
1
-1
9223372036854775807
-9223372036854775807

> INSERT INTO "int" VALUES (0)
> UPDATE "int" SET id = 1 WHERE id = -9223372036854775807
> SELECT * FROM "int"
---
0
1

!> UPDATE "int" SET id = 1
!> UPDATE "int" SET id = 2
---
Error: invalid input: primary key 1 already exists
Error: invalid input: primary key 2 already exists

# Float.
> CREATE TABLE "float" (id FLOAT PRIMARY KEY)
> INSERT INTO "float" VALUES (0.0)
> UPDATE "float" SET id = 3.14
> SELECT * FROM "float"
> UPDATE "float" SET id = -3.14
> SELECT * FROM "float"
> UPDATE "float" SET id = 0.0
> SELECT * FROM "float"
> UPDATE "float" SET id = -0.0
> SELECT * FROM "float"
> UPDATE "float" SET id = 1.23456789012345e308
> SELECT * FROM "float"
> UPDATE "float" SET id = -1.23456789012345e308
> SELECT * FROM "float"
> UPDATE "float" SET id = INFINITY
> SELECT * FROM "float"
> UPDATE "float" SET id = -INFINITY
> SELECT * FROM "float"
---
3.14
-3.14
0.0
0.0
1.23456789012345e308
-1.23456789012345e308
inf
-inf

> INSERT INTO "float" VALUES (1.0)
> UPDATE "float" SET id = 0.0 WHERE id = 1.0
> SELECT * FROM "float"
---
-inf
0.0

!> UPDATE "float" SET id = 3.14
!> UPDATE "float" SET id = -3.14
!> UPDATE "float" SET id = 0.0
!> UPDATE "float" SET id = -0.0
!> UPDATE "float" SET id = 1.23456789012345e308
!> UPDATE "float" SET id = -1.23456789012345e308
!> UPDATE "float" SET id = INFINITY
!> UPDATE "float" SET id = -INFINITY
!> UPDATE "float" SET id = NAN
!> UPDATE "float" SET id = NULL
---
Error: invalid input: primary key 3.14 already exists
Error: invalid input: primary key -3.14 already exists
Error: invalid input: primary key 0.0 already exists
Error: invalid input: primary key -0.0 already exists
Error: invalid input: primary key 1.23456789012345e308 already exists
Error: invalid input: primary key -1.23456789012345e308 already exists
Error: invalid input: primary key inf already exists
Error: invalid input: primary key -inf already exists
Error: invalid input: invalid primary key NaN
Error: invalid input: invalid primary key NULL

# String.
> CREATE TABLE "string" (id STRING PRIMARY KEY)
> INSERT INTO "string" VALUES ('')
> UPDATE "string" SET id = ''
> UPDATE "string" SET id = '  '
> UPDATE "string" SET id = 'abc'
> UPDATE "string" SET id = 'ABC'
> UPDATE "string" SET id = 'Hi! 👋'
> SELECT * FROM "string"
---
'Hi! 👋'

> INSERT INTO "string" VALUES ('')
> UPDATE "string" SET id = 'foo' WHERE id = ''
> SELECT * FROM "string"
---
'Hi! 👋'
'foo'

!> UPDATE "string" SET id = ''
!> UPDATE "string" SET id = '  '
!> UPDATE "string" SET id = 'abc'
!> UPDATE "string" SET id = 'ABC'
!> UPDATE "string" SET id = 'Hi! 👋'
!> UPDATE "string" SET id = NULL
---
Error: invalid input: primary key '' already exists
Error: invalid input: primary key '  ' already exists
Error: invalid input: primary key 'abc' already exists
Error: invalid input: primary key 'ABC' already exists
Error: invalid input: primary key 'Hi! 👋' already exists
Error: invalid input: invalid primary key NULL

# Primary key updates error if intermediate row updates violate primary key
# uniqueness, even if the final state wouldn't violate the constraints. This is
# also true with Postgres.
> SELECT * FROM "int"
---
0
1

!> UPDATE "int" SET id = id + 1
---
Error: invalid input: primary key 1 already exists

# The updates happen in primary key order, so the reverse update does work.
> UPDATE "int" SET id = id - 1
> SELECT * FROM "int"
---
-1
0


================================================
FILE: src/sql/testscripts/writes/update_reference
================================================
# Tests UPDATE foreign key references.

# Create reference tables for all datatypes.
> CREATE TABLE "bool" (id BOOL PRIMARY KEY)
> INSERT INTO "bool" VALUES (true)

> CREATE TABLE "int" (id INT PRIMARY KEY)
> INSERT INTO "int" VALUES (-1), (0), (1)

> CREATE TABLE "float" (id FLOAT PRIMARY KEY)
> INSERT INTO "float" VALUES (3.14), (0.0), (INFINITY)

> CREATE TABLE "string" (id STRING PRIMARY KEY)
> INSERT INTO "string" VALUES (''), ('foo')

> CREATE TABLE name ( \
    id INT PRIMARY KEY, \
    "bool" BOOL REFERENCES "bool", \
    "int" INT REFERENCES "int", \
    "float" FLOAT REFERENCES "float", \
    "string" STRING REFERENCES "string" \
)
> INSERT INTO name VALUES (1, NULL, NULL, NULL, NULL)
---
ok

# UPDATEs with existing references work, and update the index entries.
[ops]> UPDATE name SET "bool" = TRUE, "int" = 1, "float" = 3.14, "string" = 'foo'
---
set mvcc:NextVersion → 12 ["\x00" → "\x0c"]
set mvcc:TxnActive(11) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0b" → ""]
set mvcc:TxnWrite(11, sql:Index(name.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.bool, NULL), 11) → None ["\x04\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x00"]
set mvcc:TxnWrite(11, sql:Index(name.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(name.bool, TRUE), 11) → 1 ["\x04\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(11, sql:Index(name.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.int, NULL), 11) → None ["\x04\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x00"]
set mvcc:TxnWrite(11, sql:Index(name.int, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(name.int, 1), 11) → 1 ["\x04\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(11, sql:Index(name.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.float, NULL), 11) → None ["\x04\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x00"]
set mvcc:TxnWrite(11, sql:Index(name.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(name.float, 3.14), 11) → 1 ["\x04\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(11, sql:Index(name.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.string, NULL), 11) → None ["\x04\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x00"]
set mvcc:TxnWrite(11, sql:Index(name.string, 'foo')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(name.string, 'foo'), 11) → 1 ["\x04\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(11, sql:Row(name, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(name, 1), 11) → 1,TRUE,1,3.14,'foo' ["\x04\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0b" → "\x01\x15\x05\x02\x02\x01\x01\x02\x02\x03\x1f\x85\xebQ\xb8\x1e\t@\x04\x03foo"]
delete mvcc:TxnWrite(11, sql:Index(name.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(name.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(name.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(name.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(name.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(name.int, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(name.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Index(name.string, 'foo')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x01name\x00\xff\x00\xffstring\x00\xff\x00\xff\x04foo\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(11, sql:Row(name, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0b\x02name\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(11) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0b"]

# UPDATEs error on missing references.
!> UPDATE name SET "bool" = FALSE
!> UPDATE name SET "int" = 7
!> UPDATE name SET "float" = 2.718
!> UPDATE name SET "string" = 'bar'
---
Error: invalid input: reference FALSE not in table bool
Error: invalid input: reference 7 not in table int
Error: invalid input: reference 2.718 not in table float
Error: invalid input: reference 'bar' not in table string

# -0.0 equals 0.0.
> UPDATE name SET "float" = -0.0
---
ok

# NaN is not a valid reference.
!> UPDATE name SET "float" = NAN
---
Error: invalid input: reference NaN not in table float

# INFINITY is also valid.
> UPDATE name SET "float" = INFINITY
---
ok

# References are case sensitive.
!> UPDATE name SET "string" = 'FOO'
---
Error: invalid input: reference 'FOO' not in table string

# Empty strings are valid references.
> UPDATE name SET "string" = ''
---
ok

# NULLs are valid.
> UPDATE name SET "bool" = NULL, "int" = NULL, "float" = NULL, "string" = NULL
---
ok

> SELECT * FROM name
---
1, NULL, NULL, NULL, NULL

# Self references are fine.
> CREATE TABLE self (id INT PRIMARY KEY, self_id INT REFERENCES self)
> INSERT INTO self VALUES (1, NULL), (2, NULL), (3, NULL)
---
ok

[ops]> UPDATE self SET self_id = 1 WHERE id = 1
---
set mvcc:NextVersion → 25 ["\x00" → "\x19"]
set mvcc:TxnActive(24) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x18" → ""]
set mvcc:TxnWrite(24, sql:Index(self.self_id, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, NULL), 24) → 2,3 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18" → "\x01\x05\x02\x02\x04\x02\x06"]
set mvcc:TxnWrite(24, sql:Index(self.self_id, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, 1), 24) → 1 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(24, sql:Row(self, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(self, 1), 24) → 1,1 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x18" → "\x01\x05\x02\x02\x02\x02\x02"]
delete mvcc:TxnWrite(24, sql:Index(self.self_id, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(24, sql:Index(self.self_id, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(24, sql:Row(self, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x18\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(24) ["\x01\x00\x00\x00\x00\x00\x00\x00\x18"]

[ops]> UPDATE self SET self_id = 1 WHERE id = 2
---
set mvcc:NextVersion → 26 ["\x00" → "\x1a"]
set mvcc:TxnActive(25) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x19" → ""]
set mvcc:TxnWrite(25, sql:Index(self.self_id, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x19\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, NULL), 25) → 3 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x19" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(25, sql:Index(self.self_id, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x19\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, 1), 25) → 1,2 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x19" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(25, sql:Row(self, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x19\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(self, 2), 25) → 2,1 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x19" → "\x01\x05\x02\x02\x04\x02\x02"]
delete mvcc:TxnWrite(25, sql:Index(self.self_id, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x19\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(25, sql:Index(self.self_id, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x19\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(25, sql:Row(self, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x19\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(25) ["\x01\x00\x00\x00\x00\x00\x00\x00\x19"]

[ops]> UPDATE self SET self_id = 2 WHERE id = 3
---
set mvcc:NextVersion → 27 ["\x00" → "\x1b"]
set mvcc:TxnActive(26) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x1a" → ""]
set mvcc:TxnWrite(26, sql:Index(self.self_id, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x1a\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, NULL), 26) → None ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1a" → "\x00"]
set mvcc:TxnWrite(26, sql:Index(self.self_id, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x1a\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Index(self.self_id, 2), 26) → 3 ["\x04\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1a" → "\x01\x03\x01\x02\x06"]
set mvcc:TxnWrite(26, sql:Row(self, 3)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x1a\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00" → ""]
set mvcc:Version(sql:Row(self, 3), 26) → 3,2 ["\x04\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x1a" → "\x01\x05\x02\x02\x06\x02\x04"]
delete mvcc:TxnWrite(26, sql:Index(self.self_id, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x1a\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(26, sql:Index(self.self_id, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x1a\x01self\x00\xff\x00\xffself_id\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(26, sql:Row(self, 3)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x1a\x02self\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x03\x00\x00"]
delete mvcc:TxnActive(26) ["\x01\x00\x00\x00\x00\x00\x00\x00\x1a"]

# Breaking the reference isn't.
!> UPDATE self SET id = 4 WHERE id = 1
!> UPDATE self SET id = 4 WHERE id = 2
---
Error: invalid input: row referenced by self.id=2
Error: invalid input: row referenced by self.id=3

# Not even when only this row points to itself.
> UPDATE self SET self_id = NULL WHERE id > 1
!> UPDATE self SET id = 4 WHERE id = 1
---
Error: invalid input: reference 1 not in table self

# Updates can't violate foreign key references in intermediate states even if
# the final state retains foreign key integrity. Postgres can't either.
> SELECT * FROM "int"
---
-1
0
1

> INSERT INTO name (id, "int") VALUES (2, -1), (3, 0), (4, 1)
> SELECT * FROM name
---
1, NULL, NULL, NULL, NULL
2, NULL, -1, NULL, NULL
3, NULL, 0, NULL, NULL
4, NULL, 1, NULL, NULL

!> UPDATE "int" SET id = -id
---
Error: invalid input: row referenced by name.id=2


================================================
FILE: src/sql/testscripts/writes/update_unique
================================================
# Tests UPDATE unique index writes.

> CREATE TABLE "unique" ( \
    id INT PRIMARY KEY, \
    "bool" BOOL UNIQUE, \
    "int" INT UNIQUE, \
    "float" FLOAT UNIQUE, \
    "string" STRING UNIQUE \
)
> INSERT INTO "unique" VALUES (1, false, 1, 3.14, 'a')
> INSERT INTO "unique" VALUES (2, NULL, NULL, NULL, NULL)
---
ok

# An UPDATE updates all indexes.
[ops]> UPDATE "unique" \
    SET "bool" = true, "int" = 2, "float" = 2.718, "string" = 'b' \
    WHERE id = 2
---
set mvcc:NextVersion → 5 ["\x00" → "\x05"]
set mvcc:TxnActive(4) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x04" → ""]
set mvcc:TxnWrite(4, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 4) → None ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x00"]
set mvcc:TxnWrite(4, sql:Index(unique.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, TRUE), 4) → 2 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(4, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 4) → None ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x00"]
set mvcc:TxnWrite(4, sql:Index(unique.int, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, 2), 4) → 2 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(4, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 4) → None ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x00"]
set mvcc:TxnWrite(4, sql:Index(unique.float, 2.718)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, 2.718), 4) → 2 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(4, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 4) → None ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x00"]
set mvcc:TxnWrite(4, sql:Index(unique.string, 'b')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'b'), 4) → 2 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(4, sql:Row(unique, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 2), 4) → 2,TRUE,2,2.718,'b' ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x13\x05\x02\x04\x01\x01\x02\x04\x03X9\xb4\xc8v\xbe\x05@\x04\x01b"]
delete mvcc:TxnWrite(4, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(unique.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(unique.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(unique.float, 2.718)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(unique.int, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Index(unique.string, 'b')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(4, sql:Row(unique, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(4) ["\x01\x00\x00\x00\x00\x00\x00\x00\x04"]

# An update that violates the unique constraint fails.
!> UPDATE "unique" SET "bool" = FALSE WHERE id = 2
!> UPDATE "unique" SET "int" = 1 WHERE id = 2
!> UPDATE "unique" SET "float" = 3.14 WHERE id = 2
!> UPDATE "unique" SET "string" = 'a' WHERE id = 2
---
Error: invalid input: value FALSE already in unique column bool
Error: invalid input: value 1 already in unique column int
Error: invalid input: value 3.14 already in unique column float
Error: invalid input: value 'a' already in unique column string

# It also fails when updating all rows.
!> UPDATE "unique" SET "bool" = FALSE
!> UPDATE "unique" SET "int" = 7
!> UPDATE "unique" SET "float" = 0.0
!> UPDATE "unique" SET "string" = 'abc'
---
Error: invalid input: value FALSE already in unique column bool
Error: invalid input: value 7 already in unique column int
Error: invalid input: value 0.0 already in unique column float
Error: invalid input: value 'abc' already in unique column string

# Updates with NULLs set NULL entries. Duplicates are allowed.
[ops]> UPDATE "unique" SET "bool" = NULL, "int" = NULL, "float" = NULL, "string" = NULL
---
set mvcc:NextVersion → 14 ["\x00" → "\x0e"]
set mvcc:TxnActive(13) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\r" → ""]
set mvcc:TxnWrite(13, sql:Index(unique.bool, FALSE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, FALSE), 13) → None ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 13) → 1 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(13, sql:Index(unique.int, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, 1), 13) → None ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 13) → 1 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(13, sql:Index(unique.float, 3.14)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, 3.14), 13) → None ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 13) → 1 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(13, sql:Index(unique.string, 'a')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04a\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'a'), 13) → None ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04a\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 13) → 1 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(13, sql:Row(unique, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 1), 13) → 1,NULL,NULL,NULL,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x07\x05\x02\x02\x00\x00\x00\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.bool, TRUE)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, TRUE), 13) → None ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.bool, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.bool, NULL), 13) → 1,2 ["\x04\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(13, sql:Index(unique.int, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, 2), 13) → None ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.int, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.int, NULL), 13) → 1,2 ["\x04\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(13, sql:Index(unique.float, 2.718)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, 2.718), 13) → None ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 13) → 1,2 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(13, sql:Index(unique.string, 'b')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'b'), 13) → None ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x00"]
set mvcc:TxnWrite(13, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 13) → 1,2 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(13, sql:Row(unique, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 2), 13) → 2,NULL,NULL,NULL,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\r" → "\x01\x07\x05\x02\x04\x00\x00\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.bool, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.bool, FALSE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x00\xff\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.bool, TRUE)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffbool\x00\xff\x00\xff\x01\x01\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.float, 2.718)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\x05\xbev\xc8\xb49X\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.float, 3.14)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xc0\t\x1e\xb8Q\xeb\x85\x1f\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.int, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.int, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.int, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffint\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.string, 'a')) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04a\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(13, sql:Index(unique.string, 'b')) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04b\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(13, sql:Row(unique, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(13, sql:Row(unique, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\r\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(13) ["\x01\x00\x00\x00\x00\x00\x00\x00\r"]

> SELECT * FROM "unique"
---
1, NULL, NULL, NULL, NULL
2, NULL, NULL, NULL, NULL

# Float NaNs are considered different and allowed.
[ops]> UPDATE "unique" SET "float" = NAN
---
set mvcc:NextVersion → 15 ["\x00" → "\x0f"]
set mvcc:TxnActive(14) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0e" → ""]
set mvcc:TxnWrite(14, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 14) → 2 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(14, sql:Index(unique.float, NaN)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NaN), 14) → 1 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(14, sql:Row(unique, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 1), 14) → 1,NULL,NULL,NaN,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x0f\x05\x02\x02\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf8\x7f\x00"]
set mvcc:TxnWrite(14, sql:Index(unique.float, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NULL), 14) → None ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x00"]
set mvcc:TxnWrite(14, sql:Index(unique.float, NaN)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NaN), 14) → 1,2 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x05\x02\x02\x02\x02\x04"]
set mvcc:TxnWrite(14, sql:Row(unique, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 2), 14) → 2,NULL,NULL,NaN,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0e" → "\x01\x0f\x05\x02\x04\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf8\x7f\x00"]
delete mvcc:TxnWrite(14, sql:Index(unique.float, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(14, sql:Index(unique.float, NaN)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(14, sql:Row(unique, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnWrite(14, sql:Row(unique, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0e\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(14) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0e"]

> SELECT * FROM "unique"
---
1, NULL, NULL, NaN, NULL
2, NULL, NULL, NaN, NULL

# Float 0.0 and -0.0 are considered equal.
[ops]> UPDATE "unique" SET "float" = -0.0 WHERE id = 1
!> UPDATE "unique" SET "float" = 0.0 WHERE id = 2
---
set mvcc:NextVersion → 16 ["\x00" → "\x10"]
set mvcc:TxnActive(15) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x0f" → ""]
set mvcc:TxnWrite(15, sql:Index(unique.float, NaN)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NaN), 15) → 2 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(15, sql:Index(unique.float, 0.0)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, 0.0), 15) → 1 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(15, sql:Row(unique, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 1), 15) → 1,NULL,NULL,0.0,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0f" → "\x01\x0f\x05\x02\x02\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"]
delete mvcc:TxnWrite(15, sql:Index(unique.float, 0.0)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(15, sql:Index(unique.float, NaN)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(15, sql:Row(unique, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x0f\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(15) ["\x01\x00\x00\x00\x00\x00\x00\x00\x0f"]
Error: invalid input: value 0.0 already in unique column float

> SELECT * FROM "unique"
---
1, NULL, NULL, 0.0, NULL
2, NULL, NULL, NaN, NULL

# Float INFINITY is considered equal.
[ops]> UPDATE "unique" SET "float" = INFINITY WHERE id = 1
[ops]> UPDATE "unique" SET "float" = -INFINITY WHERE id = 2
---
set mvcc:NextVersion → 18 ["\x00" → "\x12"]
set mvcc:TxnActive(17) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x11" → ""]
set mvcc:TxnWrite(17, sql:Index(unique.float, 0.0)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x11\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, 0.0), 17) → None ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11" → "\x00"]
set mvcc:TxnWrite(17, sql:Index(unique.float, inf)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x11\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, inf), 17) → 1 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(17, sql:Row(unique, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x11\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 1), 17) → 1,NULL,NULL,inf,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x11" → "\x01\x0f\x05\x02\x02\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\x7f\x00"]
delete mvcc:TxnWrite(17, sql:Index(unique.float, 0.0)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x11\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(17, sql:Index(unique.float, inf)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x11\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf0\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(17, sql:Row(unique, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x11\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(17) ["\x01\x00\x00\x00\x00\x00\x00\x00\x11"]
set mvcc:NextVersion → 19 ["\x00" → "\x13"]
set mvcc:TxnActive(18) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x12" → ""]
set mvcc:TxnWrite(18, sql:Index(unique.float, NaN)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, NaN), 18) → None ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x00"]
set mvcc:TxnWrite(18, sql:Index(unique.float, -inf)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.float, -inf), 18) → 2 ["\x04\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(18, sql:Row(unique, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 2), 18) → 2,NULL,NULL,-inf,NULL ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x12" → "\x01\x0f\x05\x02\x04\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\xff\x00"]
delete mvcc:TxnWrite(18, sql:Index(unique.float, -inf)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\x00\xff\x0f\xff\xff\xff\xff\xff\xff\x00\x00"]
delete mvcc:TxnWrite(18, sql:Index(unique.float, NaN)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x01unique\x00\xff\x00\xfffloat\x00\xff\x00\xff\x03\xff\xf8\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(18, sql:Row(unique, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x12\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(18) ["\x01\x00\x00\x00\x00\x00\x00\x00\x12"]

> SELECT * FROM "unique"
---
1, NULL, NULL, inf, NULL
2, NULL, NULL, -inf, NULL

!> UPDATE "unique" SET "float" = INFINITY WHERE id = 2
---
Error: invalid input: value inf already in unique column float

# Empty strings are considered equal.
> UPDATE "unique" SET "string" = '' WHERE id = 1
!> UPDATE "unique" SET "string" = '' WHERE id = 2
---
Error: invalid input: value '' already in unique column string

# Case differences are not considered equal.
[ops]> UPDATE "unique" SET "string" = 'case' WHERE id = 1
[ops]> UPDATE "unique" SET "string" = 'CaSe' WHERE id = 2
---
set mvcc:NextVersion → 23 ["\x00" → "\x17"]
set mvcc:TxnActive(22) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x16" → ""]
set mvcc:TxnWrite(22, sql:Index(unique.string, '')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, ''), 22) → None ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x16" → "\x00"]
set mvcc:TxnWrite(22, sql:Index(unique.string, 'case')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'case'), 22) → 1 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x16" → "\x01\x03\x01\x02\x02"]
set mvcc:TxnWrite(22, sql:Row(unique, 1)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 1), 22) → 1,NULL,NULL,inf,'case' ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x16" → "\x01\x14\x05\x02\x02\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\x7f\x04\x04case"]
delete mvcc:TxnWrite(22, sql:Index(unique.string, '')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(22, sql:Index(unique.string, 'case')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04case\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(22, sql:Row(unique, 1)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x16\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x01\x00\x00"]
delete mvcc:TxnActive(22) ["\x01\x00\x00\x00\x00\x00\x00\x00\x16"]
set mvcc:NextVersion → 24 ["\x00" → "\x18"]
set mvcc:TxnActive(23) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x17" → ""]
set mvcc:TxnWrite(23, sql:Index(unique.string, NULL)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, NULL), 23) → None ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17" → "\x00"]
set mvcc:TxnWrite(23, sql:Index(unique.string, 'CaSe')) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00" → ""]
set mvcc:Version(sql:Index(unique.string, 'CaSe'), 23) → 2 ["\x04\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17" → "\x01\x03\x01\x02\x04"]
set mvcc:TxnWrite(23, sql:Row(unique, 2)) → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
set mvcc:Version(sql:Row(unique, 2), 23) → 2,NULL,NULL,-inf,'CaSe' ["\x04\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x17" → "\x01\x14\x05\x02\x04\x00\x00\x03\x00\x00\x00\x00\x00\x00\xf0\xff\x04\x04CaSe"]
delete mvcc:TxnWrite(23, sql:Index(unique.string, NULL)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(23, sql:Index(unique.string, 'CaSe')) ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x01unique\x00\xff\x00\xffstring\x00\xff\x00\xff\x04CaSe\x00\xff\x00\xff\x00\x00"]
delete mvcc:TxnWrite(23, sql:Row(unique, 2)) ["\x03\x00\x00\x00\x00\x00\x00\x00\x17\x02unique\x00\xff\x00\xff\x02\x80\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00"]
delete mvcc:TxnActive(23) ["\x01\x00\x00\x00\x00\x00\x00\x00\x17"]

> SELECT * FROM "unique"
---
1, NULL, NULL, inf, 'case'
2, NULL, NULL, -inf, 'CaSe'

# An UPDATE errors if intermediate states violate the uniqueness constraints,
# even if the final state wouldn't. This is the same in Postgres.
> UPDATE "unique" SET "bool" = id = 2
> SELECT * FROM "unique"
---
1, FALSE, NULL, inf, 'case'
2, TRUE, NULL, -inf, 'CaSe'

!> UPDATE "unique" SET "bool" = NOT "bool"
---
Error: invalid input: value TRUE already in unique column bool


================================================
FILE: src/sql/testscripts/writes/update_where
================================================
# Tests UPDATE with WHERE predicates.

# Create a table with some data.
> CREATE TABLE name (id INT PRIMARY KEY, value STRING, "index" INT INDEX)
> INSERT INTO name VALUES (1, 'a', 1), (2, 'b', 2), (3, 'c', NULL);
---
ok

# Boolean filters work, and are trivial.
> BEGIN
[plan]> UPDATE name SET value = 'foo' WHERE TRUE
> SELECT * FROM name
> ROLLBACK
---
Update: name (value='foo')
└─ Scan: name
1, 'foo', 1
2, 'foo', 2
3, 'foo', NULL

[plan]> UPDATE name SET value = 'foo' WHERE FALSE
> SELECT * FROM name
---
Update: name (value='foo')
└─ Nothing
1, 'a', 1
2, 'b', 2
3, 'c', NULL

# Updating by primary key lookup.
> BEGIN
[plan]> UPDATE name SET value = 'foo' WHERE id = 1 OR id = 3
> SELECT * FROM name
> ROLLBACK
---
Update: name (value='foo')
└─ KeyLookup: name (1, 3)
1, 'foo', 1
2, 'b', 2
3, 'foo', NULL

# Updating by index lookup.
> BEGIN
[plan]> UPDATE name SET value = 'foo' WHERE "index" = 2
> SELECT * FROM name
> ROLLBACK
---
Update: name (value='foo')
└─ IndexLookup: name.index (2)
1, 'a', 1
2, 'foo', 2
3, 'c', NULL

# Including IS NULL.
> BEGIN
[plan]> UPDATE name SET value = 'foo' WHERE "index" IS NULL
> SELECT * FROM name
> ROLLBACK
---
Update: name (value='foo')
└─ IndexLookup: name.index (NULL)
1, 'a', 1
2, 'b', 2
3, 'foo', NULL

# Updating by arbitrary predicate over full scan.
> BEGIN
[plan]> UPDATE name SET value = 'foo' WHERE id >= 5 - 2 OR (value LIKE 'a') IS NULL
> SELECT * FROM name
> ROLLBACK
---
Update: name (value='foo')
└─ Scan: name (name.id > 3 OR name.id = 3 OR name.value LIKE 'a' IS NULL)
1, 'a', 1
2, 'b', 2
3, 'foo', NULL

# Non-boolean predicates error, except NULL which is equivalent to FALSE.
!> UPDATE name SET value = 'foo' WHERE 0
!> UPDATE name SET value = 'foo' WHERE 1
!> UPDATE name SET value = 'foo' WHERE 3.14
!> UPDATE name SET value = 'foo' WHERE NaN
!> UPDATE name SET value = 'foo' WHERE ''
!> UPDATE name SET value = 'foo' WHERE 'true
---
Error: invalid input: filter returned 0, expected boolean
Error: invalid input: filter returned 1, expected boolean
Error: invalid input: filter returned 3.14, expected boolean
Error: invalid input: filter returned NaN, expected boolean
Error: invalid input: filter returned '', expected boolean
Error: invalid input: unexpected end of string literal

> UPDATE name SET value = 'foo' WHERE NULL
> SELECT * FROM name
---
1, 'a', 1
2, 'b', 2
3, 'c', NULL

# Bare WHERE errors.
!> UPDATE name SET value = 'foo' WHERE
---
Error: invalid input: unexpected end of input

# Missing column errors.
!> UPDATE name SET value = 'foo' WHERE missing = 'foo'
---
Error: invalid input: unknown column missing


================================================
FILE: src/sql/types/expression.rs
================================================
use std::fmt::Display;

use regex::Regex;
use serde::{Deserialize, Serialize};

use super::{Label, Row, Value};
use crate::errinput;
use crate::error::Result;
use crate::sql::planner::Node;

/// An expression, made up of nested operations and values. Values are either
/// constants, or numeric column references which are looked up in rows.
/// Evaluated to a final value during query execution.
///
/// Since this is a recursive data structure, we have to box each child
/// expression, which incurs a heap allocation per expression node. There are
/// clever ways to avoid this, but we keep it simple.
///
/// NOTE(review): this enum derives `Serialize`/`Deserialize`. If it is encoded
/// with a format that identifies variants by index, reordering or removing
/// variants would change the encoded form -- confirm before restructuring.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Expression {
    /// A constant value.
    Constant(Value),
    /// A column reference. Looks up the value in a row during evaluation.
    Column(usize),

    /// a AND b: logical AND of two booleans.
    And(Box<Expression>, Box<Expression>),
    /// a OR b: logical OR of two booleans.
    Or(Box<Expression>, Box<Expression>),
    /// NOT a: logical NOT of a boolean.
    Not(Box<Expression>),

    /// a = b: equality comparison of two values.
    Equal(Box<Expression>, Box<Expression>),
    /// a > b: greater than comparison of two values.
    GreaterThan(Box<Expression>, Box<Expression>),
    /// a < b: less than comparison of two values.
    LessThan(Box<Expression>, Box<Expression>),
    /// a IS NULL or a IS NAN: checks for the given value.
    Is(Box<Expression>, Value),

    /// a + b: adds two numbers.
    Add(Box<Expression>, Box<Expression>),
    /// a / b: divides two numbers.
    Divide(Box<Expression>, Box<Expression>),
    /// a ^ b: exponentiates two numbers.
    Exponentiate(Box<Expression>, Box<Expression>),
    /// a!: takes the factorial of a number (4! = 4*3*2*1).
    Factorial(Box<Expression>),
    /// +a: the identity function, which simply returns the same number.
    Identity(Box<Expression>),
    /// a * b: multiplies two numbers.
    Multiply(Box<Expression>, Box<Expression>),
    /// -a: negates the given number.
    Negate(Box<Expression>),
    /// a % b: the remainder after dividing two numbers.
    Remainder(Box<Expression>, Box<Expression>),
    /// √a: takes the square root of a number.
    SquareRoot(Box<Expression>),
    /// a - b: subtracts two numbers.
    Subtract(Box<Expression>, Box<Expression>),

    /// a LIKE b: checks if a string matches a pattern.
    Like(Box<Expression>, Box<Expression>),
}

impl Expression {
    /// Displays the expression, using the given plan node to look up labels for
    /// column references.
    pub fn display<'a>(&'a self, node: &'a Node) -> ExpressionDisplay<'a> {
        ExpressionDisplay::new(self, node, 0)
    }

    /// Evaluates an expression, returning a constant value. Column references
    /// are looked up in the given row.
    ///
    /// # Errors
    ///
    /// Returns an input error if an operator is applied to operands of an
    /// unsupported type, or if an integer operation overflows.
    ///
    /// # Panics
    ///
    /// Panics if a column reference can't be resolved (the row is None or the
    /// index is out of bounds), or if an IS expression carries a value other
    /// than NULL or NaN.
    pub fn evaluate(&self, row: Option<&Row>) -> Result<Value> {
        use Value::*;

        Ok(match self {
            // Constant values return themselves.
            Self::Constant(value) => value.clone(),

            // Column references look up a row value. The planner ensures that
            // only constant expressions are evaluated without a row.
            Self::Column(index) => row.and_then(|r| r.get(*index)).cloned().expect("invalid index"),

            // Logical AND. Inputs must be boolean or NULL. NULLs generally
            // yield NULL, except the special case NULL AND false == false.
            Self::And(lhs, rhs) => match (lhs.evaluate(row)?, rhs.evaluate(row)?) {
                (Boolean(lhs), Boolean(rhs)) => Boolean(lhs && rhs),
                (Boolean(b), Null) | (Null, Boolean(b)) if !b => Boolean(false),
                (Boolean(_), Null) | (Null, Boolean(_)) | (Null, Null) => Null,
                (lhs, rhs) => return errinput!("can't AND {lhs} and {rhs}"),
            },

            // Logical OR. Inputs must be boolean or NULL. NULLs generally
            // yield NULL, except the special case NULL OR true == true.
            Self::Or(lhs, rhs) => match (lhs.evaluate(row)?, rhs.evaluate(row)?) {
                (Boolean(lhs), Boolean(rhs)) => Boolean(lhs || rhs),
                (Boolean(b), Null) | (Null, Boolean(b)) if b => Boolean(true),
                (Boolean(_), Null) | (Null, Boolean(_)) | (Null, Null) => Null,
                (lhs, rhs) => return errinput!("can't OR {lhs} and {rhs}"),
            },

            // Logical NOT. Input must be boolean or NULL.
            Self::Not(expr) => match expr.evaluate(row)? {
                Boolean(b) => Boolean(!b),
                Null => Null,
                value => return errinput!("can't NOT {value}"),
            },

            // Comparisons. Must be of same type, except floats and integers
            // which are interchangeable. NULLs yield NULL, NaNs yield NaN.
            //
            // Does not dispatch to Value.cmp() because comparison and sorting
            // is different for Nulls and NaNs in SQL and code.
            #[allow(clippy::float_cmp)]
            Self::Equal(lhs, rhs) => match (lhs.evaluate(row)?, rhs.evaluate(row)?) {
                (Boolean(lhs), Boolean(rhs)) => Boolean(lhs == rhs),
                (Integer(lhs), Integer(rhs)) => Boolean(lhs == rhs),
                (Integer(lhs), Float(rhs)) => Boolean(lhs as f64 == rhs),
                (Float(lhs), Integer(rhs)) => Boolean(lhs == rhs as f64),
                (Float(lhs), Float(rhs)) => Boolean(lhs == rhs),
                (String(lhs), String(rhs)) => Boolean(lhs == rhs),
                (Null, _) | (_, Null) => Null,
                (lhs, rhs) => return errinput!("can't compare {lhs} and {rhs}"),
            },

            Self::GreaterThan(lhs, rhs) => match (lhs.evaluate(row)?, rhs.evaluate(row)?) {
                #[allow(clippy::bool_comparison)]
                (Boolean(lhs), Boolean(rhs)) => Boolean(lhs > rhs),
                (Integer(lhs), Integer(rhs)) => Boolean(lhs > rhs),
                (Integer(lhs), Float(rhs)) => Boolean(lhs as f64 > rhs),
                (Float(lhs), Integer(rhs)) => Boolean(lhs > rhs as f64),
                (Float(lhs), Float(rhs)) => Boolean(lhs > rhs),
                (String(lhs), String(rhs)) => Boolean(lhs > rhs),
                (Null, _) | (_, Null) => Null,
                (lhs, rhs) => return errinput!("can't compare {lhs} and {rhs}"),
            },

            Self::LessThan(lhs, rhs) => match (lhs.evaluate(row)?, rhs.evaluate(row)?) {
                #[allow(clippy::bool_comparison)]
                (Boolean(lhs), Boolean(rhs)) => Boolean(lhs < rhs),
                (Integer(lhs), Integer(rhs)) => Boolean(lhs < rhs),
                (Integer(lhs), Float(rhs)) => Boolean((lhs as f64) < rhs),
                (Float(lhs), Integer(rhs)) => Boolean(lhs < rhs as f64),
                (Float(lhs), Float(rhs)) => Boolean(lhs < rhs),
                (String(lhs), String(rhs)) => Boolean(lhs < rhs),
                (Null, _) | (_, Null) => Null,
                (lhs, rhs) => return errinput!("can't compare {lhs} and {rhs}"),
            },

            // IS NULL always yields a boolean. IS NAN yields NULL for NULL
            // inputs, and errors for non-float inputs.
            Self::Is(expr, Null) => Boolean(expr.evaluate(row)? == Null),
            Self::Is(expr, Float(f)) if f.is_nan() => match expr.evaluate(row)? {
                Float(f) => Boolean(f.is_nan()),
                Null => Null,
                // Null was handled above, so the value always has a datatype.
                v => return errinput!("IS NAN can't be used with {}", v.datatype().unwrap()),
            },
            Self::Is(_, v) => panic!("invalid IS value {v}"), // enforced by parser

            // Mathematical operations. Inputs must be numbers, but integers and
            // floats are interchangeable (float when mixed). NULLs yield NULL.
            // Errors on integer overflow, but floats yield infinity or NaN.
            Self::Add(lhs, rhs) => lhs.evaluate(row)?.checked_add(&rhs.evaluate(row)?)?,
            Self::Divide(lhs, rhs) => lhs.evaluate(row)?.checked_div(&rhs.evaluate(row)?)?,
            Self::Exponentiate(lhs, rhs) => lhs.evaluate(row)?.checked_pow(&rhs.evaluate(row)?)?,
            Self::Factorial(expr) => match expr.evaluate(row)? {
                Integer(i @ 0..) => {
                    (1..=i).try_fold(Integer(1), |p, i| p.checked_mul(&Integer(i)))?
                }
                Null => Null,
                value => return errinput!("can't take factorial of {value}"),
            },
            Self::Identity(expr) => match expr.evaluate(row)? {
                value @ (Integer(_) | Float(_) | Null) => value,
                expr => return errinput!("can't take the identity of {expr}"),
            },
            Self::Multiply(lhs, rhs) => lhs.evaluate(row)?.checked_mul(&rhs.evaluate(row)?)?,
            Self::Negate(expr) => match expr.evaluate(row)? {
                // Negating i64::MIN overflows, so it must error like the other
                // integer operations rather than panic or wrap around.
                Integer(i) => match i.checked_neg() {
                    Some(negated) => Integer(negated),
                    None => return errinput!("integer overflow"),
                },
                Float(f) => Float(-f),
                Null => Null,
                value => return errinput!("can't negate {value}"),
            },
            Self::Remainder(lhs, rhs) => lhs.evaluate(row)?.checked_rem(&rhs.evaluate(row)?)?,
            Self::SquareRoot(expr) => match expr.evaluate(row)? {
                Integer(i @ 0..) => Float((i as f64).sqrt()),
                Float(f) => Float(f.sqrt()),
                Null => Null,
                value => return errinput!("can't take square root of {value}"),
            },
            Self::Subtract(lhs, rhs) => lhs.evaluate(row)?.checked_sub(&rhs.evaluate(row)?)?,

            // LIKE pattern matching, using _ and % as single- and
            // multi-character wildcards. Inputs must be strings. NULLs yield
            // NULL. There's no support for escaping an _ and %.
            Self::Like(lhs, rhs) => match (lhs.evaluate(row)?, rhs.evaluate(row)?) {
                (String(lhs), String(rhs)) => {
                    // We could precompile the pattern if it's constant, instead
                    // of recompiling it for every row, but we keep it simple.
                    //
                    // The (?s) flag makes . match newlines too, since the SQL
                    // wildcards must match any character.
                    let pattern = format!(
                        "(?s)^{}$",
                        regex::escape(&rhs).replace('%', ".*").replace('_', ".")
                    );
                    Boolean(Regex::new(&pattern)?.is_match(&lhs))
                }
                (String(_), Null) | (Null, String(_)) | (Null, Null) => Null,
                (lhs, rhs) => return errinput!("can't LIKE {lhs} and {rhs}"),
            },
        })
    }

    /// Visits every node of the expression tree depth-first (parents before
    /// children), stopping as soon as the closure returns false. Returns
    /// whether the entire tree was visited without the closure rejecting.
    pub fn walk(&self, visitor: &mut impl FnMut(&Expression) -> bool) -> bool {
        // Visit this node first; only descend if the visitor wants to go on.
        visitor(self)
            && match self {
                // Leaf nodes have nothing further to visit.
                Self::Column(_) | Self::Constant(_) => true,

                // Unary operators descend into their single operand.
                Self::Factorial(operand)
                | Self::Identity(operand)
                | Self::Is(operand, _)
                | Self::Negate(operand)
                | Self::Not(operand)
                | Self::SquareRoot(operand) => operand.walk(visitor),

                // Binary operators visit the left operand before the right.
                Self::Add(left, right)
                | Self::And(left, right)
                | Self::Divide(left, right)
                | Self::Equal(left, right)
                | Self::Exponentiate(left, right)
                | Self::GreaterThan(left, right)
                | Self::LessThan(left, right)
                | Self::Like(left, right)
                | Self::Multiply(left, right)
                | Self::Or(left, right)
                | Self::Remainder(left, right)
                | Self::Subtract(left, right) => left.walk(visitor) && right.walk(visitor),
            }
    }

    /// Returns true if the closure returns true for any node in the expression
    /// tree, visiting nodes depth-first and stopping at the first match. This
    /// is the inverse of walk().
    pub fn contains(&self, visitor: &impl Fn(&Expression) -> bool) -> bool {
        // walk() stops (and returns false) as soon as its closure returns
        // false, so hand it the negated predicate and negate the outcome.
        let mut no_match = |expr: &Expression| !visitor(expr);
        let walked_everything = self.walk(&mut no_match);
        !walked_everything
    }

    /// Rewrites the expression tree depth-first. For every node, `before` is
    /// applied first, then the children of its result are transformed
    /// recursively, and finally `after` is applied to the rebuilt node. The
    /// first error returned by either closure aborts the transformation.
    pub fn transform(
        self,
        before: &impl Fn(Self) -> Result<Self>,
        after: &impl Fn(Self) -> Result<Self>,
    ) -> Result<Self> {
        // Transforms a boxed child in place, reusing the existing box.
        let descend = |mut child: Box<Expression>| -> Result<Box<Expression>> {
            *child = child.transform(before, after)?;
            Ok(child)
        };

        let rebuilt = match before(self)? {
            // Leaf nodes have no children to descend into.
            leaf @ (Self::Column(_) | Self::Constant(_)) => leaf,

            // Unary operators.
            Self::Factorial(expr) => Self::Factorial(descend(expr)?),
            Self::Identity(expr) => Self::Identity(descend(expr)?),
            Self::Is(expr, value) => Self::Is(descend(expr)?, value),
            Self::Negate(expr) => Self::Negate(descend(expr)?),
            Self::Not(expr) => Self::Not(descend(expr)?),
            Self::SquareRoot(expr) => Self::SquareRoot(descend(expr)?),

            // Binary operators, left operand first.
            Self::Add(lhs, rhs) => Self::Add(descend(lhs)?, descend(rhs)?),
            Self::And(lhs, rhs) => Self::And(descend(lhs)?, descend(rhs)?),
            Self::Divide(lhs, rhs) => Self::Divide(descend(lhs)?, descend(rhs)?),
            Self::Equal(lhs, rhs) => Self::Equal(descend(lhs)?, descend(rhs)?),
            Self::Exponentiate(lhs, rhs) => Self::Exponentiate(descend(lhs)?, descend(rhs)?),
            Self::GreaterThan(lhs, rhs) => Self::GreaterThan(descend(lhs)?, descend(rhs)?),
            Self::LessThan(lhs, rhs) => Self::LessThan(descend(lhs)?, descend(rhs)?),
            Self::Like(lhs, rhs) => Self::Like(descend(lhs)?, descend(rhs)?),
            Self::Multiply(lhs, rhs) => Self::Multiply(descend(lhs)?, descend(rhs)?),
            Self::Or(lhs, rhs) => Self::Or(descend(lhs)?, descend(rhs)?),
            Self::Remainder(lhs, rhs) => Self::Remainder(descend(lhs)?, descend(rhs)?),
            Self::Subtract(lhs, rhs) => Self::Subtract(descend(lhs)?, descend(rhs)?),
        };
        after(rebuilt)
    }

    /// Converts the expression into conjunctive normal form, i.e. an AND of
    /// ORs, useful during plan optimization. This is done by converting to
    /// negation normal form and then applying De Morgan's distributive law.
    pub fn into_cnf(self) -> Self {
        use Expression::{And, Or};

        let xform = |expr| {
            // Can't use a single match; needs deref patterns.
            let Or(lhs, rhs) = expr else {
                return expr;
            };
            match (*lhs, *rhs) {
                // (x AND y) OR z → (x OR z) AND (y OR z)
                (And(l, r), rhs) => And(Or(l, rhs.clone().into()).into(), Or(r, rhs.into()).into()),
                // x OR (y AND z) → (x OR y) AND (x OR z)
                (lhs, And(l, r)) => And(Or(lhs.clone().into(), l).into(), Or(lhs.into(), r).into()),
                // Otherwise, do nothing.
                (lhs, rhs) => Or(lhs.into(), rhs.into()),
            }
        };
        self.into_nnf().transform(&|e| Ok(xform(e)), &Ok).unwrap() // infallible
    }

    /// Converts the expression via into_cnf() and flattens the top-level ANDs
    /// into a vector of expressions that are implicitly ANDed together,
    /// preserving their left-to-right order.
    pub fn into_cnf_vec(self) -> Vec<Self> {
        let mut conjuncts = Vec::new();
        let mut pending = vec![self.into_cnf()];
        while let Some(next) = pending.pop() {
            match next {
                Self::And(lhs, rhs) => {
                    // Push rhs first, such that lhs is popped (and emitted)
                    // before it.
                    pending.push(*rhs);
                    pending.push(*lhs);
                }
                other => conjuncts.push(other),
            }
        }
        conjuncts
    }

    /// Converts the expression into negation normal form. This pushes NOT
    /// operators into the tree using De Morgan's laws, such that they're always
    /// below other logical operators. It is a useful intermediate form for
    /// applying other logical normalizations.
    pub fn into_nnf(self) -> Self {
        use Expression::{And, Not, Or};

        let xform = |expr| {
            // Can't use a single match; needs deref patterns.
            let Not(inner) = expr else {
                return expr;
            };
            match *inner {
                // NOT (x AND y) → (NOT x) OR (NOT y)
                And(lhs, rhs) => Or(Not(lhs).into(), Not(rhs).into()),
                // NOT (x OR y) → (NOT x) AND (NOT y)
                Or(lhs, rhs) => And(Not(lhs).into(), Not(rhs).into()),
                // NOT NOT x → x
                Not(inner) => *inner,
                // Otherwise, do nothing.
                expr => Not(expr.into()),
            }
        };
        self.transform(&|e| Ok(xform(e)), &Ok).unwrap() // infallible
    }

    /// Combines the given expressions into a single left-nested AND
    /// expression. Returns None if the vector is empty, and the sole
    /// expression unchanged if there is only one.
    pub fn and_vec(exprs: Vec<Expression>) -> Option<Self> {
        exprs.into_iter().reduce(|lhs, rhs| Expression::And(lhs.into(), rhs.into()))
    }

    /// Returns the column index if the expression is a lookup on a single
    /// column, i.e. a column = constant comparison, an IS NULL/NAN check on a
    /// column, or an OR of such lookups that all use the same column.
    pub fn is_column_lookup(&self) -> Option<usize> {
        use Expression::*;

        match self {
            // Equality between a column and a constant, in either order. NULL
            // and NaN constants are dealt with in into_column_values().
            Equal(lhs, rhs) => match (lhs.as_ref(), rhs.as_ref()) {
                (Column(index), Constant(_)) | (Constant(_), Column(index)) => Some(*index),
                _ => None,
            },
            // IS NULL and IS NAN directly on a column.
            Is(expr, _) => {
                if let Column(index) = expr.as_ref() {
                    Some(*index)
                } else {
                    None
                }
            }
            // Both OR branches must be lookups, and on the same column:
            // id = 1 OR id = 2 OR id = 3.
            Or(lhs, rhs) => {
                let left = lhs.is_column_lookup()?;
                let right = rhs.is_column_lookup()?;
                (left == right).then_some(left)
            }
            _ => None,
        }
    }

    /// Consumes a column lookup expression and returns the values to look up
    /// for the given column. Panics if the expression isn't a lookup of that
    /// column, i.e. is_column_lookup() must return Some(index) for it.
    pub fn into_column_values(self, index: usize) -> Vec<Value> {
        use Expression::*;

        match self {
            // Collect the values of both branches, left before right.
            Or(lhs, rhs) => {
                let mut values = lhs.into_column_values(index);
                values.append(&mut rhs.into_column_values(index));
                values
            }
            // IS NULL and IS NAN look up the NULL or NaN value itself.
            Is(expr, value) => match *expr {
                Column(column) => {
                    assert_eq!(column, index, "unexpected column");
                    vec![value]
                }
                expr => panic!("unexpected expression {expr:?}"),
            },
            Equal(lhs, rhs) => {
                let (column, value) = match (*lhs, *rhs) {
                    (Column(column), Constant(value)) | (Constant(value), Column(column)) => {
                        (column, value)
                    }
                    (lhs, rhs) => {
                        panic!("unexpected expression {:?}", Equal(lhs.into(), rhs.into()))
                    }
                };
                assert_eq!(column, index, "unexpected column");
                // NULL and NAN index entries are reserved for IS NULL and IS
                // NAN. Equality with them never matches, so look up nothing.
                if value.is_undefined() { Vec::new() } else { vec![value] }
            }
            expr => panic!("unexpected expression {expr:?}"),
        }
    }

    /// Replaces column references from → to.
    pub fn replace_column(self, from: usize, to: usize) -> Self {
        let xform = |expr| match expr {
            Expression::Column(i) if i == from => Expression::Column(to),
            expr => expr,
        };
        self.transform(&|e| Ok(xform(e)), &Ok).unwrap() // infallible
    }

    /// Shifts column references by the given amount (can be negative).
    pub fn shift_column(self, diff: isize) -> Self {
        let xform = |expr| match expr {
            Expression::Column(i) => Expression::Column((i as isize + diff) as usize),
            expr => expr,
        };
        self.transform(&|e| Ok(xform(e)), &Ok).unwrap() // infallible
    }
}

// NB: this Display implementation has no plan node to resolve column labels
// from, so it prints numeric column indexes. Use Expression::display() to
// print with labels resolved from a given plan node.
impl Display for Expression {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Format against a placeholder plan node without any columns.
        let unlabeled = Node::Nothing { columns: Vec::new() };
        Display::fmt(&self.display(&unlabeled), f)
    }
}

/// Helper to display expressions. Groups with () as needed by precedence rules,
/// and looks up column labels in the given plan node.
pub struct ExpressionDisplay<'a> {
    /// The expression to display.
    expr: &'a Expression,
    /// The plan node used to look up labels for column references.
    node: &'a Node,
    /// The precedence of the parent expression. The expression is wrapped in
    /// parentheses when its own precedence is lower than this.
    parent_precedence: u8,
}

impl<'a> Display for ExpressionDisplay<'a> {
    /// Writes the expression in SQL-like syntax, wrapping it in parentheses
    /// if it binds less tightly than its parent expression.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use Expression::*;

        // Parenthesize the expression if its precedence is lower than the
        // parent's.
        let precedence = Self::precedence(self.expr);
        let parenthesize = precedence < self.parent_precedence;

        // Wraps a child expression for display, with this expression as parent.
        let node = self.node;
        let child =
            |expr: &'a Expression| ExpressionDisplay { expr, node, parent_precedence: precedence };

        if parenthesize {
            f.write_str("(")?;
        }

        let written: std::fmt::Result = match self.expr {
            Constant(value) => write!(f, "{value}"),
            // Fall back to the numeric column index when there is no label.
            Column(index) => match node.column_label(*index) {
                Label::None => write!(f, "#{index}"),
                label => write!(f, "{label}"),
            },

            And(lhs, rhs) => write!(f, "{} AND {}", child(lhs), child(rhs)),
            Or(lhs, rhs) => write!(f, "{} OR {}", child(lhs), child(rhs)),
            Not(expr) => write!(f, "NOT {}", child(expr)),

            Equal(lhs, rhs) => write!(f, "{} = {}", child(lhs), child(rhs)),
            GreaterThan(lhs, rhs) => write!(f, "{} > {}", child(lhs), child(rhs)),
            LessThan(lhs, rhs) => write!(f, "{} < {}", child(lhs), child(rhs)),
            Is(expr, Value::Null) => write!(f, "{} IS NULL", child(expr)),
            Is(expr, Value::Float(n)) if n.is_nan() => write!(f, "{} IS NAN", child(expr)),
            Is(_, v) => panic!("unexpected IS value {v}"),

            Add(lhs, rhs) => write!(f, "{} + {}", child(lhs), child(rhs)),
            Divide(lhs, rhs) => write!(f, "{} / {}", child(lhs), child(rhs)),
            Exponentiate(lhs, rhs) => write!(f, "{} ^ {}", child(lhs), child(rhs)),
            Factorial(expr) => write!(f, "{}!", child(expr)),
            Identity(expr) => write!(f, "{}", child(expr)),
            Multiply(lhs, rhs) => write!(f, "{} * {}", child(lhs), child(rhs)),
            Negate(expr) => write!(f, "-{}", child(expr)),
            Remainder(lhs, rhs) => write!(f, "{} % {}", child(lhs), child(rhs)),
            SquareRoot(expr) => write!(f, "sqrt({})", child(expr)),
            Subtract(lhs, rhs) => write!(f, "{} - {}", child(lhs), child(rhs)),

            Like(lhs, rhs) => write!(f, "{} LIKE {}", child(lhs), child(rhs)),
        };
        written?;

        if parenthesize {
            f.write_str(")")?;
        }
        Ok(())
    }
}

impl<'a> ExpressionDisplay<'a> {
    // Wraps an expression for display, using the given plan node for column
    // labels and the precedence of the surrounding (parent) expression.
    pub fn new(expr: &'a Expression, node: &'a Node, parent_precedence: u8) -> Self {
        ExpressionDisplay { expr, node, parent_precedence }
    }

    // Returns the precedence level of the expression's operator, used to
    // decide on () grouping. Higher binds tighter. Matches the parser.
    fn precedence(expr: &Expression) -> u8 {
        use Expression::*;
        match expr {
            Or(..) => 1,
            And(..) => 2,
            Not(..) => 3,
            Equal(..) | Like(..) | Is(..) => 4,
            GreaterThan(..) | LessThan(..) => 5,
            Add(..) | Subtract(..) => 6,
            Multiply(..) | Divide(..) | Remainder(..) => 7,
            Exponentiate(..) => 8,
            Factorial(..) => 9,
            Identity(..) | Negate(..) => 10,
            Column(..) | Constant(..) | SquareRoot(..) => 11,
        }
    }
}

impl From<Value> for Expression {
    fn from(value: Value) -> Self {
        Expression::Constant(value)
    }
}

impl From<Value> for Box<Expression> {
    fn from(value: Value) -> Self {
        Box::new(value.into())
    }
}


================================================
FILE: src/sql/types/mod.rs
================================================
//! The SQL data model, including data types, expressions, and schema objects.

mod expression;
mod schema;
mod value;

pub use expression::Expression;
pub use schema::{Column, Table};
pub use value::{DataType, Label, Row, Rows, Value};


================================================
FILE: src/sql/types/schema.rs
================================================
use std::fmt::Display;

use serde::{Deserialize, Serialize};

use super::{DataType, Row, Value};
use crate::encoding;
use crate::errinput;
use crate::error::Result;
use crate::sql::engine::{Catalog, Transaction};
use crate::sql::parser::is_ident;

/// A table schema, which specifies its data structure and constraints. The
/// constraints documented on the fields are checked by validate().
///
/// Tables can't change after they are created. There is no ALTER TABLE nor
/// CREATE/DROP INDEX, only CREATE TABLE and DROP TABLE.
#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
pub struct Table {
    /// The table name. Unique identifier for the table. Can't be empty.
    pub name: String,
    /// The primary key column index, i.e. an index into `columns`. A table
    /// must have a primary key, and it can only be a single column.
    pub primary_key: usize,
    /// The table's columns. Must have at least one.
    pub columns: Vec<Column>,
}

// Marker implementation of the encoding::Value trait for table schemas.
impl encoding::Value for Table {}

/// A table column. The constraints documented on the fields are checked by
/// Table::validate().
#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
pub struct Column {
    /// Column name. Can't be empty.
    pub name: String,
    /// Column datatype.
    pub datatype: DataType,
    /// Whether the column allows null values. Not legal for primary keys.
    pub nullable: bool,
    /// The column's default value. If None, the user must specify an explicit
    /// value. Must match the column datatype. Nullable columns require a
    /// default (often Null). Null is only a valid default when nullable.
    pub default: Option<Value>,
    /// Whether the column should only allow unique values (ignoring NULLs).
    /// Must be true for a primary key column. Requires index, except for the
    /// primary key.
    pub unique: bool,
    /// Whether the column should have a secondary index. Must be false for
    /// primary keys, which are the primary index. Must be true for unique or
    /// reference columns.
    pub index: bool,
    /// If set, this column is a foreign key reference to the given table's
    /// primary key. Must be of the same type as the target primary key.
    /// Requires index, except for the primary key.
    pub references: Option<String>,
}

// Formats the table as a SQL CREATE TABLE statement. Table and column names,
// including referenced table names, are quoted when they aren't plain
// identifiers.
impl Display for Table {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        /// Formats an identifier as valid SQL, quoting it if necessary.
        fn format_ident(ident: &str) -> String {
            match is_ident(ident) {
                true => ident.to_string(),
                // Embedded double quotes are escaped by doubling them.
                false => format!("\"{}\"", ident.replace('\"', "\"\"")),
            }
        }

        writeln!(f, "CREATE TABLE {} (", format_ident(&self.name))?;
        for (i, column) in self.columns.iter().enumerate() {
            write!(f, "  {} {}", format_ident(&column.name), column.datatype)?;
            // PRIMARY KEY implies NOT NULL, so don't emit both.
            if i == self.primary_key {
                write!(f, " PRIMARY KEY")?;
            } else if !column.nullable {
                write!(f, " NOT NULL")?;
            }
            if let Some(default) = &column.default {
                write!(f, " DEFAULT {default}")?;
            }
            // The primary key is implicitly unique and indexed.
            if i != self.primary_key {
                if column.unique {
                    write!(f, " UNIQUE")?;
                }
                if column.index {
                    write!(f, " INDEX")?;
                }
            }
            if let Some(reference) = &column.references {
                // The referenced table name is an identifier like any other,
                // and must be quoted if needed to yield valid SQL.
                write!(f, " REFERENCES {}", format_ident(reference))?;
            }
            // Separate columns with commas, but omit it for the last one.
            if i + 1 < self.columns.len() {
                write!(f, ",")?;
            }
            writeln!(f)?;
        }
        write!(f, ")")
    }
}

impl Table {
    /// Checks that the table schema is well-formed, returning an input error
    /// for the first violation found. Foreign key references to other tables
    /// are resolved via the given catalog.
    pub fn validate(&self, catalog: &impl Catalog) -> Result<()> {
        if self.name.is_empty() {
            return errinput!("table name can't be empty");
        }
        if self.columns.is_empty() {
            return errinput!("table has no columns");
        }
        let Some(primary) = self.columns.get(self.primary_key) else {
            return errinput!("invalid primary key index");
        };

        for (position, column) in self.columns.iter().enumerate() {
            if column.name.is_empty() {
                return errinput!("column name can't be empty");
            }
            let (cname, ctype) = (&column.name, &column.datatype); // for formatting convenience
            let is_primary_key = position == self.primary_key;

            // The primary key must be a unique, non-nullable column without a
            // secondary index.
            if is_primary_key && column.nullable {
                return errinput!("primary key {cname} cannot be nullable");
            }
            if is_primary_key && !column.unique {
                return errinput!("primary key {cname} must be unique");
            }
            if is_primary_key && column.index {
                return errinput!("primary key {cname} can't have an index");
            }

            // The default value must agree with the nullability and datatype.
            match &column.default {
                None if column.nullable => {
                    return errinput!("nullable column {cname} must have a default value");
                }
                None => {}
                Some(default) => match default.datatype() {
                    None if !column.nullable => {
                        return errinput!("invalid NULL default for non-nullable column {cname}");
                    }
                    Some(vtype) if vtype != column.datatype => {
                        return errinput!("invalid default type {vtype} for {ctype} column {cname}");
                    }
                    _ => {}
                },
            }

            // Unique columns need an index (the primary key is one itself).
            if column.unique && !(column.index || is_primary_key) {
                return errinput!("unique column {cname} must have a secondary index");
            }

            // Reference columns need an index too, and must have the same
            // datatype as the primary key of the table they point at.
            if let Some(reference) = &column.references {
                if !(column.index || is_primary_key) {
                    return errinput!("reference column {cname} must have a secondary index");
                }
                let reftype = if reference == &self.name {
                    // Self-references can't use the catalog; use this schema.
                    primary.datatype
                } else {
                    match catalog.get_table(reference)? {
                        Some(target) => target.columns[target.primary_key].datatype,
                        None => {
                            return errinput!(
                                "unknown table {reference} referenced by column {cname}"
                            );
                        }
                    }
                };
                if column.datatype != reftype {
                    return errinput!(
                        "can't reference {reftype} primary key of {reference} from {ctype} column {cname}"
                    );
                }
            }
        }
        Ok(())
    }

    /// Checks that a row is valid for the table, returning an input error for
    /// the first violation found. Uniqueness and foreign key references are
    /// checked against the given transaction.
    ///
    /// If update is true, the row replaces an existing entry with the same
    /// primary key. Otherwise, it is an insert. Primary key changes are
    /// implemented as a delete+insert.
    ///
    /// Checking uniqueness and references one row at a time is not
    /// performant, but it's fine for our purposes.
    pub fn validate_row(&self, row: &Row, update: bool, txn: &impl Transaction) -> Result<()> {
        if row.len() != self.columns.len() {
            return errinput!("invalid row size for table {}", self.name);
        }

        // The primary key must be defined, and new on inserts.
        let pk = self.primary_key;
        let id = &row[pk];
        if id.is_undefined() {
            return errinput!("invalid primary key {id}");
        }
        if !update && !txn.get(&self.name, &row[pk..=pk])?.is_empty() {
            return errinput!("primary key {id} already exists");
        }

        for (i, column) in self.columns.iter().enumerate() {
            let value = &row[i];
            let valueslice = &row[i..=i];
            let (cname, ctype) = (&column.name, &column.datatype);

            // The value must have the column's datatype, or be an allowed NULL.
            match value.datatype() {
                Some(vtype) if vtype != *ctype => {
                    return errinput!("invalid datatype {vtype} for {ctype} column {cname}");
                }
                _ => {}
            }
            if !column.nullable && value == &Value::Null {
                return errinput!("NULL value not allowed for column {cname}");
            }

            // Outgoing references must exist in the target table, unless the
            // value is NULL or the row references itself.
            //
            // NB: NaN is not a valid primary key, and not valid as a missing
            // foreign key marker.
            if let Some(target) = &column.references {
                let v = value;
                let is_null = matches!(v, Value::Null);
                let is_self_reference = target == &self.name && v == id;
                if !is_null && !is_self_reference && txn.get(target, valueslice)?.is_empty() {
                    return errinput!("reference {v} not in table {target}");
                }
            }

            // Unique columns (which are always indexed) must not already
            // contain the value in any other row.
            let check_unique = column.unique && i != pk && !value.is_undefined();
            if check_unique {
                let mut index = txn.lookup_index(&self.name, &column.name, valueslice)?;
                if update {
                    index.remove(id); // ignore existing version of this row
                }
                if !index.is_empty() {
                    return errinput!("value {value} already in unique column {cname}");
                }
            }
        }
        Ok(())
    }
}


================================================
FILE: src/sql/types/value.rs
================================================
use std::borrow::Cow;
use std::cmp::{Eq, Ordering, PartialEq};
use std::fmt::Display;
use std::hash::{Hash, Hasher};
use std::result::Result as StdResult;

use dyn_clone::DynClone;
use serde::{Deserialize, Serialize, Serializer};

use crate::encoding;
use crate::error::{Error, Result};
use crate::sql::parser::ast;
use crate::{errdata, errinput};

/// A primitive SQL data type. For simplicity, only a handful of scalar types
/// are supported (no compound types).
///
/// There is no variant for NULL: a NULL value has no datatype of its own.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Serialize, Deserialize)]
pub enum DataType {
    /// A boolean: true or false.
    Boolean,
    /// A 64-bit signed integer.
    Integer,
    /// A 64-bit floating point number.
    Float,
    /// A UTF-8 encoded string.
    String,
}

impl Display for DataType {
    /// Writes the datatype as its upper-case SQL keyword.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let keyword = match self {
            Self::Boolean => "BOOLEAN",
            Self::Integer => "INTEGER",
            Self::Float => "FLOAT",
            Self::String => "STRING",
        };
        f.write_str(keyword)
    }
}

/// A primitive SQL value, represented as a native Rust type.
///
/// Equality, hashing, and ordering are implemented by hand rather than
/// derived, so that NULL and NaN values can be detected, indexed, and ordered.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum Value {
    /// An unknown value of unknown type (i.e. SQL NULL).
    ///
    /// In code, Null is considered equal to Null, so that we can detect, index,
    /// and order these values. The SQL NULL semantics are implemented during
    /// Expression evaluation.
    Null,
    /// A boolean.
    Boolean(bool),
    /// A 64-bit signed integer.
    Integer(i64),
    /// A 64-bit floating point number.
    ///
    /// In code, NaN is considered equal to NaN, so that we can detect, index,
    /// and order these values. The SQL NAN semantics are implemented during
    /// Expression evaluation.
    ///
    /// -0.0 and -NaN are considered equal to their positive counterpart, and
    /// normalized as positive when serialized (for key lookups). Only
    /// serialization is customized; deserialization uses the default f64
    /// handling.
    Float(#[serde(serialize_with = "serialize_f64")] f64),
    /// A UTF-8 encoded string.
    String(String),
}

// Opt Value into the encoding::Value trait. The impl body is empty, so only
// the trait's provided behaviour is used.
impl encoding::Value for Value {}

impl Display for Value {
    /// Formats the value as a SQL-style literal: `NULL`, `TRUE`/`FALSE`, a
    /// plain integer, a float in debug notation (so `1.0` keeps its decimal
    /// point), or a single-quoted string with debug escaping.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Self::Null => f.write_str("NULL"),
            Self::Boolean(b) => f.write_str(if *b { "TRUE" } else { "FALSE" }),
            // Delegate to the integer's own Display, honouring formatter flags.
            Self::Integer(i) => Display::fmt(i, f),
            Self::Float(x) => write!(f, "{x:?}"),
            Self::String(s) => write!(f, "'{}'", s.escape_debug()),
        }
    }
}

/// Serializes an f64, flipping the sign of -0.0 and -NaN first so that they
/// encode identically to 0.0 and NaN in the key/value store (e.g. for index
/// lookups). All other values are serialized unchanged.
fn serialize_f64<S: Serializer>(value: &f64, serializer: S) -> StdResult<S::Ok, S::Error> {
    let is_negative_zero_or_nan = value.is_sign_negative() && (value.is_nan() || *value == 0.0);
    let normalized = if is_negative_zero_or_nan { -*value } else { *value };
    serializer.serialize_f64(normalized)
}

// Consider Nulls and ±NaNs equal. Rust already considers -0.0 == 0.0.
impl PartialEq for Value {
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (Self::Boolean(a), Self::Boolean(b)) => a == b,
            (Self::Integer(a), Self::Integer(b)) => a == b,
            (Self::Integer(a), Self::Float(b)) => *a as f64 == *b,
            (Self::Float(a), Self::Integer(b)) => *a == *b as f64,
            (Self::Float(a), Self::Float(b)) => a == b || a.is_nan() && b.is_nan(),
            (Self::String(a), Self::String(b)) => a == b,
            (Self::Null, Self::Null) => true,
            (_, _) => false,
        }
    }
}

// Marker impl: the PartialEq impl treats Null == Null and NaN == NaN, so every
// value is equal to itself.
impl Eq for Value {}

// Allow hashing Nulls and floats, in a way that agrees with PartialEq: values
// that compare equal must produce the same hash.
//
// Since an Integer compares equal to a Float with the same numeric value,
// both are hashed via a shared tag and their f64 representation. -0.0 is
// hashed as 0.0, and every NaN (regardless of sign or payload) is hashed as a
// single canonical NaN, because equality treats them as the same value.
impl Hash for Value {
    fn hash<H: Hasher>(&self, hasher: &mut H) {
        /// Hashes a numeric value by its normalized f64 bit pattern.
        fn hash_number<H: Hasher>(number: f64, hasher: &mut H) {
            let normalized = if number.is_nan() {
                f64::NAN
            } else if number == 0.0 {
                0.0 // folds -0.0 into 0.0
            } else {
                number
            };
            normalized.to_bits().hash(hasher)
        }

        // The leading tag separates the types. Integer and Float deliberately
        // share a tag, since they can compare equal to each other.
        match self {
            Self::Null => 0u8.hash(hasher),
            Self::Boolean(v) => {
                1u8.hash(hasher);
                v.hash(hasher)
            }
            Self::Integer(v) => {
                2u8.hash(hasher);
                // Same widening conversion that equality uses.
                hash_number(*v as f64, hasher)
            }
            Self::Float(v) => {
                2u8.hash(hasher);
                hash_number(*v, hasher)
            }
            Self::String(v) => {
                3u8.hash(hasher);
                v.hash(hasher)
            }
        }
    }
}

// Consider Nulls and NaNs equal when ordering, consistently with PartialEq:
// two values compare as Ordering::Equal exactly when they are equal.
//
// In particular -0.0 orders equal to 0.0, and all NaNs (of either sign) order
// equal to each other and after every other number. f64::total_cmp is not used
// because it distinguishes -0.0 from 0.0 and -NaN from NaN, which equality
// does not.
//
// We establish a total order across all types, even though mixed types will
// rarely/never come up: String > Integer/Float > Boolean > Null.
impl Ord for Value {
    fn cmp(&self, other: &Self) -> Ordering {
        /// Orders two floats numerically, with NaN equal to NaN and greater
        /// than any non-NaN value.
        fn cmp_floats(a: f64, b: f64) -> Ordering {
            match (a.is_nan(), b.is_nan()) {
                (true, true) => Ordering::Equal,
                (true, false) => Ordering::Greater,
                (false, true) => Ordering::Less,
                // Non-NaN floats are always comparable, so the fallback is
                // never taken.
                (false, false) => a.partial_cmp(&b).unwrap_or(Ordering::Equal),
            }
        }

        match (self, other) {
            (Self::Null, Self::Null) => Ordering::Equal,
            (Self::Boolean(a), Self::Boolean(b)) => a.cmp(b),
            (Self::Integer(a), Self::Integer(b)) => a.cmp(b),
            (Self::Integer(a), Self::Float(b)) => cmp_floats(*a as f64, *b),
            (Self::Float(a), Self::Integer(b)) => cmp_floats(*a, *b as f64),
            (Self::Float(a), Self::Float(b)) => cmp_floats(*a, *b),
            (Self::String(a), Self::String(b)) => a.cmp(b),

            // Mixed types: order by type. Numeric pairs were handled above, so
            // the Float/Integer arms below only apply against a String.
            (Self::Null, _) => Ordering::Less,
            (_, Self::Null) => Ordering::Greater,
            (Self::Boolean(_), _) => Ordering::Less,
            (_, Self::Boolean(_)) => Ordering::Greater,
            (Self::Float(_), _) => Ordering::Less,
            (_, Self::Float(_)) => Ordering::Greater,
            (Self::Integer(_), _) => Ordering::Less,
            (_, Self::Integer(_)) => Ordering::Greater,
            // String is ordered last.
        }
    }
}

impl PartialOrd for Value {
    /// Always returns `Some`, deferring to the total order defined by `Ord`.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Value {
    /// Returns the value's datatype, or None for null values.
    pub fn datatype(&self) -> Option<DataType> {
        match self {
            Self::Null => None,
            Self::Boolean(_) => Some(DataType::Boolean),
            Self::Integer(_) => Some(DataType::Integer),
            Self::Float(_) => Some(DataType::Float),
            Self::String(_) => Some(DataType::String),
        }
    }

    /// Returns true if the value is undefined (NULL or NaN).
    pub fn is_undefined(&self) -> bool {
        match self {
            Self::Null => true,
            Self::Float(f) if f.is_nan() => true,
            _ => false,
        }
    }

    /// Adds two values. Errors if invalid.
    ///
    /// Integer addition errors on overflow. Mixing an integer and a float
    /// yields a float. If either operand is Null and the other is numeric or
    /// Null, the result is Null. Any other combination is an error.
    pub fn checked_add(&self, other: &Self) -> Result<Self> {
        use Value::*;

        Ok(match (self, other) {
            (Integer(lhs), Integer(rhs)) => match lhs.checked_add(*rhs) {
                Some(i) => Integer(i),
                None => return errinput!("integer overflow"),
            },
            (Integer(lhs), Float(rhs)) => Float(*lhs as f64 + rhs),
            (Float(lhs), Integer(rhs)) => Float(lhs + *rhs as f64),
            (Float(lhs), Float(rhs)) => Float(lhs + rhs),
            (Null, Integer(_) | Float(_) | Null) => Null,
            (Integer(_) | Float(_), Null) => Null,
            (lhs, rhs) => return errinput!("can't add {lhs} and {rhs}"),
        })
    }

    /// Divides two values. Errors if invalid.
    ///
    /// Integer division errors on a zero divisor, and on overflow (the
    /// minimum integer divided by -1 isn't representable). Float division
    /// follows IEEE 754 and never errors. Null handling is the same as for
    /// [`Value::checked_add`].
    pub fn checked_div(&self, other: &Self) -> Result<Self> {
        use Value::*;

        Ok(match (self, other) {
            (Integer(_), Integer(0)) => return errinput!("can't divide by zero"),
            // The divisor is non-zero here, so None can only mean overflow.
            // Plain `/` would panic in that case.
            (Integer(lhs), Integer(rhs)) => match lhs.checked_div(*rhs) {
                Some(i) => Integer(i),
                None => return errinput!("integer overflow"),
            },
            (Integer(lhs), Float(rhs)) => Float(*lhs as f64 / rhs),
            (Float(lhs), Integer(rhs)) => Float(lhs / *rhs as f64),
            (Float(lhs), Float(rhs)) => Float(lhs / rhs),
            (Null, Integer(_) | Float(_) | Null) => Null,
            (Integer(_) | Float(_), Null) => Null,
            (lhs, rhs) => return errinput!("can't divide {lhs} and {rhs}"),
        })
    }

    /// Multiplies two values. Errors if invalid.
    ///
    /// Integer multiplication errors on overflow. Null handling is the same
    /// as for [`Value::checked_add`].
    pub fn checked_mul(&self, other: &Self) -> Result<Self> {
        use Value::*;

        Ok(match (self, other) {
            (Integer(lhs), Integer(rhs)) => match lhs.checked_mul(*rhs) {
                Some(i) => Integer(i),
                None => return errinput!("integer overflow"),
            },
            (Integer(lhs), Float(rhs)) => Float(*lhs as f64 * rhs),
            (Float(lhs), Integer(rhs)) => Float(lhs * *rhs as f64),
            (Float(lhs), Float(rhs)) => Float(lhs * rhs),
            (Null, Integer(_) | Float(_) | Null) => Null,
            (Integer(_) | Float(_), Null) => Null,
            (lhs, rhs) => return errinput!("can't multiply {lhs} and {rhs}"),
        })
    }

    /// Exponentiates two values. Errors if invalid.
    ///
    /// An integer raised to a non-negative integer yields an integer, and
    /// errors on overflow. An integer raised to a negative integer yields a
    /// float. Null handling is the same as for [`Value::checked_add`].
    pub fn checked_pow(&self, other: &Self) -> Result<Self> {
        use Value::*;

        Ok(match (self, other) {
            (Integer(lhs), Integer(rhs)) if *rhs >= 0 => {
                let rhs = (*rhs).try_into().or_else(|_| errinput!("integer overflow"))?;
                match lhs.checked_pow(rhs) {
                    Some(i) => Integer(i),
                    None => return errinput!("integer overflow"),
                }
            }
            (Integer(lhs), Integer(rhs)) => Float((*lhs as f64).powf(*rhs as f64)),
            (Integer(lhs), Float(rhs)) => Float((*lhs as f64).powf(*rhs)),
            // Use the integer power function when the exponent fits in an
            // i32. Otherwise fall back to powf instead of truncating the
            // exponent, which would silently compute a different power.
            (Float(lhs), Integer(rhs)) => match i32::try_from(*rhs) {
                Ok(exponent) => Float(lhs.powi(exponent)),
                Err(_) => Float(lhs.powf(*rhs as f64)),
            },
            (Float(lhs), Float(rhs)) => Float((lhs).powf(*rhs)),
            (Integer(_) | Float(_), Null) => Null,
            (Null, Integer(_) | Float(_) | Null) => Null,
            (lhs, rhs) => return errinput!("can't exponentiate {lhs} and {rhs}"),
        })
    }

    /// Finds the remainder of two values. Errors if invalid.
    ///
    /// NB: uses the remainder, not modulo, like Postgres. This means that for
    /// negative values, the result has the sign of the dividend, rather than
    /// always returning a positive value.
    ///
    /// Integer remainder errors on a zero divisor. Null handling is the same
    /// as for [`Value::checked_add`].
    pub fn checked_rem(&self, other: &Self) -> Result<Self> {
        use Value::*;

        Ok(match (self, other) {
            (Integer(_), Integer(0)) => return errinput!("can't divide by zero"),
            // wrapping_rem yields 0 for the minimum integer % -1, which is the
            // mathematically correct remainder. Plain `%` would panic there.
            (Integer(lhs), Integer(rhs)) => Integer(lhs.wrapping_rem(*rhs)),
            (Integer(lhs), Float(rhs)) => Float(*lhs as f64 % rhs),
            (Float(lhs), Integer(rhs)) => Float(lhs % *rhs as f64),
            (Float(lhs), Float(rhs)) => Float(lhs % rhs),
            (Integer(_) | Float(_) | Null, Null) => Null,
            (Null, Integer(_) | Float(_)) => Null,
            (lhs, rhs) => return errinput!("can't take remainder of {lhs} and {rhs}"),
        })
    }

    /// Subtracts two values. Errors if invalid.
    ///
    /// Integer subtraction errors on overflow. Null handling is the same as
    /// for [`Value::checked_add`].
    pub fn checked_sub(&self, other: &Self) -> Result<Self> {
        use Value::*;

        Ok(match (self, other) {
            (Integer(lhs), Integer(rhs)) => match lhs.checked_sub(*rhs) {
                Some(i) => Integer(i),
                None => return errinput!("integer overflow"),
            },
            (Integer(lhs), Float(rhs)) => Float(*lhs as f64 - rhs),
            (Float(lhs), Integer(rhs)) => Float(lhs - *rhs as f64),
            (Float(lhs), Float(rhs)) => Float(lhs - rhs),
            (Null, Integer(_) | Float(_) | Null) => Null,
            (Integer(_) | Float(_), Null) => Null,
            (lhs, rhs) => return errinput!("can't subtract {lhs} and {rhs}"),
        })
    }
}

impl From<bool> for Value {
    fn from(v: bool) -> Self {
        Value::Boolean(v)
    }
}

impl From<f64> for Value {
    fn from(v: f64) -> Self {
        Value::Float(v)
    }
}

impl From<i64> for Value {
    fn from(v: i64) -> Self {
        Value::Integer(v)
    }
}

impl From<String> for Value {
    fn from(v: String) -> Self {
        Value::String(v)
    }
}

impl From<&str> for Value {
    fn from(v: &str) -> Self {
        Value::String(v.to_owned())
    }
}

/// Extracts a bool from a boolean value, erroring for any other variant.
impl TryFrom<Value> for bool {
    type Error = Error;

    fn try_from(value: Value) -> Result<Self> {
        match value {
            Value::Boolean(b) => Ok(b),
            value => errdata!("not a boolean: {value}"),
        }
    }
}

/// Extracts an f64 from a float value, erroring for any other variant.
impl TryFrom<Value> for f64 {
    type Error = Error;

    fn try_from(value: Value) -> Result<Self> {
        match value {
            Value::Float(f) => Ok(f),
            value => errdata!("not a float: {value}"),
        }
    }
}

/// Extracts an i64 from an integer value, erroring for any other variant.
impl TryFrom<Value> for i64 {
    type Error = Error;

    fn try_from(value: Value) -> Result<Self> {
        match value {
            Value::Integer(i) => Ok(i),
            value => errdata!("not an integer: {value}"),
        }
    }
}

/// Extracts the owned string from a string value, erroring for any other
/// variant.
impl TryFrom<Value> for String {
    type Error = Error;

    fn try_from(value: Value) -> Result<Self> {
        match value {
            Value::String(s) => Ok(s),
            value => errdata!("not a string: {value}"),
        }
    }
}

/// Wraps a value reference as a borrowed Cow, without cloning it.
impl<'a> From<&'a Value> for Cow<'a, Value> {
    fn from(v: &'a Value) -> Self {
        Self::Borrowed(v)
    }
}

/// A row of values.
pub type Row = Vec<Value>;

/// A row iterator: a boxed [`RowIterator`] trait object.
pub type Rows = Box<dyn RowIterator>;

/// A row iterator trait, which requires the iterator to be both clonable and
/// object-safe. Cloning allows resetting an iterator back to an initial state,
/// e.g. for nested loop joins. It uses a blanket implementation, and relies on
/// dyn_clone to allow cloning trait objects.
///
/// Each item is a `Result<Row>`, so the iterator can yield an error per row.
pub trait RowIterator: Iterator<Item = Result<Row>> + DynClone {}

// Generates the Clone implementation for boxed RowIterator trait objects.
dyn_clone::clone_trait_object!(RowIterator);

// Blanket implementation: any DynClone iterator over Result<Row> is a
// RowIterator.
impl<I: Iterator<Item = Result<Row>> + DynClone> RowIterator for I {}

/// A column label, used in query results and plans.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Label {
    /// No label.
    None,
    /// An unqualified column name.
    Unqualified(String),
    /// A fully qualified table/column name: the table name, then the column
    /// name.
    Qualified(String, String),
}

impl Display for Label {
    /// Writes `table.column` for qualified labels, the bare column name for
    /// unqualified labels, and nothing at all for `Label::None`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let (table, column) = match self {
            Self::None => return f.write_str(""),
            Self::Unqualified(name) => return f.write_str(name),
            Self::Qualified(table, column) => (table, column),
        };
        write!(f, "{table}.{column}")
    }
}

impl Label {
    /// Formats the label as a short column header: the column name without any
    /// table qualifier, or `?` when there is no label.
    pub fn as_header(&self) -> &str {
        match self {
            Self::None => "?",
            Self::Unqualified(name) => name.as_str(),
            Self::Qualified(_, name) => name.as_str(),
        }
    }
}

impl From<Label> for ast::Expression {
    /// Builds an ast::Expression::Column for a label, with the table name set
    /// only for qualified labels. Panics if the label is None.
    fn from(label: Label) -> Self {
        let (table, column) = match label {
            Label::None => panic!("can't convert None label to AST expression"),
            Label::Unqualified(column) => (None, column),
            Label::Qualified(table, column) => (Some(table), column),
        };
        ast::Expression::Column(table, column)
    }
}

/// Converts an optional name into an unqualified label, or Label::None when
/// no name is given.
impl From<Option<String>> for Label {
    fn from(name: Option<String>) -> Self {
        match name {
            Some(name) => Label::Unqualified(name),
            None => Label::None,
        }
    }
}


================================================
FILE: src/storage/bitcask.rs
================================================
use std::collections::BTreeMap;
use std::collections::btree_map::Range;
use std::fs::File;
use std::io::{BufReader, BufWriter, Read as _, Seek as _, SeekFrom, Write as _};
use std::ops::{Bound, RangeBounds};
use std::path::PathBuf;
use std::result::Result as StdResult;

use fs4::fs_std::FileExt;
use log::{error, info};

use super::{Engine, Status};
use crate::error::{Error, Result};

/// A very simple variant of BitCask, itself a simple log-structured key-value
/// engine used e.g. by the Riak database. This is not compatible with BitCask
/// databases generated by other implementations. See:
/// <https://riak.com/assets/bitcask-intro.pdf>
///
/// BitCask writes key-value pairs to an append-only log file, and keeps a
/// mapping of keys to file offsets in memory. All live keys must fit in memory.
/// Deletes write a tombstone value to the log file. To remove old garbage
/// (deleted or replaced keys), logs can be compacted by writing new logs
/// containing only live data, dropping replaced values and tombstones.
///
/// This implementation is significantly simpler than standard BitCask:
///
/// * Instead of writing multiple fixed-size log files, it uses a single
///   append-only log file of arbitrary size. This increases the compaction
///   volume, since the entire log file must be rewritten on every compaction.
///   It can also exceed the filesystem's file size limit. However, toyDB
///   databases are expected to be small.
///
/// * Compactions lock the database for reads and writes. This is ok since toyDB
///   only compacts during node startup and files are expected to be small.
///
/// * Hint files are not used, the log itself is scanned when opened to
///   build the keydir. Hint files only omit values, and toyDB values are
///   expected to be small, so the hint files would be nearly as large as
///   the compacted log files themselves.
///
/// * Log entries don't contain timestamps or checksums.
///
/// The structure of an encoded log entry is:
///
/// 1. Key length as big-endian u32 [4 bytes].
/// 2. Value length as big-endian i32, or -1 for tombstones [4 bytes].
/// 3. Key as raw bytes [<= 4 GB, bounded by the u32 length prefix].
/// 4. Value as raw bytes [<= 2 GB, bounded by the i32 length prefix].
pub struct BitCask {
    /// The current append-only log file.
    log: Log,
    /// Maps keys to a value's offset and length in [`BitCask::log`].
    keydir: KeyDir,
}

/// Maps keys to a value's location in the log file. It is an ordered map, which
/// allows range scans over keys.
type KeyDir = BTreeMap<Vec<u8>, ValueLocation>;

/// The location of a value in the log file.
#[derive(Clone, Copy)]
struct ValueLocation {
    /// Byte offset of the value's first byte, from the start of the file.
    offset: u64,
    /// Length of the value in bytes.
    length: usize,
}

impl ValueLocation {
    /// Returns the file offset just past the value's last byte.
    fn end(&self) -> u64 {
        let length = self.length as u64;
        self.offset + length
    }
}

impl BitCask {
    /// Opens or creates a BitCask database in the given file, and builds the
    /// in-memory key directory by scanning the log.
    pub fn new(path: PathBuf) -> Result<Self> {
        let mut log = Log::new(path)?;
        let keydir = log.build_keydir()?;
        info!("Opened {} with {} live keys", log.path.display(), keydir.len());
        Ok(Self { log, keydir })
    }

    /// Opens a BitCask database, and automatically compacts it if the amount
    /// of garbage exceeds the given ratio and byte size when opened.
    ///
    /// Compaction only happens when there is any garbage at all, and both
    /// thresholds are met.
    pub fn new_maybe_compact(
        path: PathBuf,
        garbage_min_fraction: f64,
        garbage_min_bytes: u64,
    ) -> Result<Self> {
        let mut engine = Self::new(path)?;

        let status = engine.status()?;
        let (total_size, garbage_size) = (status.disk_size, status.garbage_disk_size());
        let garbage_fraction = garbage_size as f64 / total_size as f64;

        let should_compact = garbage_size > 0
            && garbage_size >= garbage_min_bytes
            && garbage_fraction >= garbage_min_fraction;
        if !should_compact {
            return Ok(engine);
        }

        // Converts a byte count to megabytes, for logging.
        let megabytes = |bytes: u64| bytes as f64 / 1024.0 / 1024.0;

        info!(
            "Compacting {} to remove {:.0}% garbage ({:.1} MB out of {:.1} MB)",
            engine.log.path.display(),
            garbage_fraction * 100.0,
            megabytes(garbage_size),
            megabytes(total_size)
        );
        engine.compact()?;
        info!(
            "Compacted {} to size {:.1} MB",
            engine.log.path.display(),
            megabytes(total_size - garbage_size)
        );

        Ok(engine)
    }
}

impl Engine for BitCask {
    type ScanIterator<'a> = ScanIterator<'a>;

    /// Appends a tombstone for the key to the log, then drops the key from
    /// the keydir. The keydir is left untouched if the write fails.
    fn delete(&mut self, key: &[u8]) -> Result<()> {
        self.log.write_entry(key, None).map(|_| {
            self.keydir.remove(key);
        })
    }

    /// Syncs the log file to disk (skipped in test builds).
    fn flush(&mut self) -> Result<()> {
        // Tests skip the fsync to run faster. This is disabled here rather
        // than by setting `raft::Log::fsync = false` in tests, because we want
        // to assert that the Raft log flushes to disk even when the flush
        // itself is a noop.
        #[cfg(not(test))]
        self.log.file.sync_all()?;
        Ok(())
    }

    /// Looks up the key in the keydir, and reads its value from the log file
    /// if it exists.
    fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
        match self.keydir.get(key).copied() {
            Some(location) => Ok(Some(self.log.read_value(location)?)),
            None => Ok(None),
        }
    }

    /// Iterates over the keydir entries in the given key range, reading each
    /// value from the log file as the iterator is advanced.
    fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIterator<'_> {
        let inner = self.keydir.range(range);
        ScanIterator { inner, log: &mut self.log }
    }

    /// Like `scan`, but returns a boxed trait object.
    fn scan_dyn(
        &mut self,
        range: (Bound<Vec<u8>>, Bound<Vec<u8>>),
    ) -> Box<dyn super::ScanIterator + '_> {
        let iterator = self.scan(range);
        Box::new(iterator)
    }

    /// Appends the key/value pair to the log, then points the keydir entry at
    /// the newly written value.
    fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
        let location = self.log.write_entry(key, Some(value.as_slice()))?;
        self.keydir.insert(key.to_vec(), location);
        Ok(())
    }

    /// Reports the number of live keys, their logical size, the log file's
    /// size on disk, and the on-disk size of the live entries.
    fn status(&mut self) -> Result<Status> {
        let keys = self.keydir.len() as u64;
        let mut size = 0u64;
        for (key, location) in &self.keydir {
            size += (key.len() + location.length) as u64;
        }
        let disk_size = self.log.file.metadata()?.len();
        // Each live entry also has two 4-byte length prefixes on disk.
        let live_disk_size = size + 8 * keys;
        Ok(Status { name: "bitcask".to_string(), keys, size, disk_size, live_disk_size })
    }
}

impl BitCask {
    /// Compacts the current log file by writing out a new log file containing
    /// only live keys and replacing the current file with it.
    pub fn compact(&mut self) -> Result<()> {
        // Open the temporary log next to the current one, emptying it in case
        // it was left over from an earlier run.
        let mut compacted = Log::new(self.log.path.with_extension("new"))?;
        compacted.file.set_len(0)?;

        // Copy each live value over, recording where it ended up.
        let mut keydir = KeyDir::new();
        for (key, &old_location) in &self.keydir {
            let value = self.log.read_value(old_location)?;
            let new_location = compacted.write_entry(key, Some(value.as_slice()))?;
            keydir.insert(key.clone(), new_location);
        }

        // Move the new file into place, and switch over to it.
        std::fs::rename(&compacted.path, &self.log.path)?;
        compacted.path.clone_from(&self.log.path);

        self.keydir = keydir;
        self.log = compacted;
        Ok(())
    }
}

/// Attempt to flush the file when the database is closed.
impl Drop for BitCask {
    fn drop(&mut self) {
        if let Err(error) = self.flush() {
            error!("failed to flush file: {}", error)
        }
    }
}

/// An iterator over a key range of a BitCask database. It walks the keydir
/// range, and reads each value from the log file when the entry is yielded.
pub struct ScanIterator<'a> {
    /// The keydir entries in the scanned range.
    inner: Range<'a, Vec<u8>, ValueLocation>,
    /// The log file to read values from.
    log: &'a mut Log,
}

impl ScanIterator<'_> {
    /// Turns a keydir entry into an owned key/value pair by reading the value
    /// from the log file. Errors if the read fails.
    fn map(&mut self, item: (&Vec<u8>, &ValueLocation)) -> <Self as Iterator>::Item {
        let value = self.log.read_value(*item.1)?;
        Ok((item.0.clone(), value))
    }
}

impl Iterator for ScanIterator<'_> {
    type Item = Result<(Vec<u8>, Vec<u8>)>;

    /// Yields the next key/value pair in ascending key order.
    fn next(&mut self) -> Option<Self::Item> {
        let entry = self.inner.next()?;
        Some(self.map(entry))
    }
}

impl DoubleEndedIterator for ScanIterator<'_> {
    /// Yields the next key/value pair from the back of the range.
    fn next_back(&mut self) -> Option<Self::Item> {
        let entry = self.inner.next_back()?;
        Some(self.map(entry))
    }
}

/// A BitCask append-only log file, containing a sequence of key/value
/// entries encoded as follows:
///
/// 1. Key length as big-endian u32 [4 bytes].
/// 2. Value length as big-endian i32, or -1 for tombstones [4 bytes].
/// 3. Key as raw bytes [<= 4 GB, bounded by the u32 length prefix].
/// 4. Value as raw bytes [<= 2 GB, bounded by the i32 length prefix].
struct Log {
    /// The open log file.
    file: File,
    /// Path to the log file.
    path: PathBuf,
}

impl Log {
    /// Opens a log file, or creates one if it does not exist. Takes out an
    /// exclusive lock on the file until it is closed, or errors if the lock is
    /// already held.
    fn new(path: PathBuf) -> Result<Self> {
        if let Some(dir) = path.parent() {
            std::fs::create_dir_all(dir)?
        }
        let file = std::fs::OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(false)
            .open(&path)?;
        if !file.try_lock_exclusive()? {
            return Err(Error::IO(format!("file {path:?} is already is use")));
        }
        Ok(Self { file, path })
    }

    /// Builds a keydir by scanning the log file. If an incomplete entry is
    /// encountered, it is assumed to be caused by an incomplete write operation
    /// and the remainder of the file is truncated.
    fn build_keydir(&mut self) -> Result<KeyDir> {
        let mut len_buf = [0u8; 4];
        let mut keydir = KeyDir::new();
        let file_len = self.file.metadata()?.len();
        let mut r = BufReader::new(&mut self.file);
        let mut offset = r.seek(SeekFrom::Start(0))?;

        while offset < file_len {
            // Read the next entry from the file, returning the key and value
            // location, or None for tombstones.
            let result = || -> StdResult<(Vec<u8>, Option<ValueLocation>), std::io::Error> {
                // Read the key length: 4-byte u32.
                r.read_exact(&mut len_buf)?;
                let key_len = u32::from_be_bytes(len_buf);

                // Read the value length: 4-byte i32, -1 for tombstones.
                r.read_exact(&mut len_buf)?;
                let value_loc = match i32::from_be_bytes(len_buf) {
                    ..0 => None, // tombstone
                    len => Some(ValueLocation {
                        offset: offset + 8 + key_len as u64,
                        length: len as usize,
                    }),
                };

                // Read the key.
                let mut key = vec![0; key_len as usize];
                r.read_exact(&mut key)?;

                // Skip past the value.
                if let Some(value_loc) = value_loc {
                    if value_loc.end() > file_len {
                        return Err(std::io::Error::new(
                            std::io::ErrorKind::UnexpectedEof,
                            "value extends beyond end of file",
                        ));
                    }
                    r.seek_relative(value_loc.length as i64)?;
                }

                // Update the file offset.
                offset += 8 + key_len as u64 + value_loc.map_or(0, |v| v.length) as u64;

                Ok((key, value_loc))
            }();

            // Update the keydir with the entry.
            match result {
                Ok((key, Some(value_loc))) => keydir.insert(key, value_loc),
                Ok((key, None)) => keydir.remove(&key),
                // If an incomplete entry was found at the end of the file, assume an
                // incomplete write and truncate the file.
                Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => {
                    error!("Found incomplete entry at offset {offset}, truncating file");
                    self.file.set_len(offset)?;
                    break;
                }
                Err(err) => return Err(err.into()),
            };
        }

        Ok(keydir)
    }

    /// Reads a value from the log file at the given location.
    fn read_value(&mut self, location: ValueLocation) -> Result<Vec<u8>> {
        let mut value = vec![0; location.length];
        self.file.seek(SeekFrom::Start(location.offset))?;
        self.file.read_exact(&mut value)?;
        Ok(value)
    }

    /// Appends a key/value entry to the log file, using a None value for
    /// tombstones. It returns the location of the entry's value in the log, for
    /// use with the [`KeyDir`].
    fn write_entry(&mut self, key: &[u8], value: Option<&[u8]>) -> Result<ValueLocation> {
        let length = 8 + key.len() + value.map_or(0, |v| v.len());
        let offset = self.file.seek(SeekFrom::End(0))?;
        let mut w = BufWriter::with_capacity(length, &mut self.file);

        // Key length: 4-byte u32.
        w.write_all(&(key.len() as u32).to_be_bytes())?;

        // Value length: 4-byte i32, -1 for tombstones.
        w.write_all(&value.map_or(-1, |v| v.len() as i32).to_be_bytes())?;

        // The actual key and value.
        w.write_all(key)?;
        w.write_all(value.unwrap_or_default())?;
        w.flush()?;

        // Translate the entry location into a value location.
        Ok(ValueLocation {
            offset: offset + 8 + key.len() as u64,
            length: value.map_or(0, |v| v.len()),
        })
    }
}

/// Most storage tests are Goldenscripts under src/storage/testscripts.
#[cfg(test)]
mod tests {
    use std::error::Error as StdError;
    use std::fmt::Write as _;

    use tempfile::TempDir;
    use test_each_file::test_each_path;

    use super::super::engine::test::Runner;
    use super::*;
    use crate::encoding::format::{self, Formatter as _};

    // Run common goldenscript tests in src/storage/testscripts/engine.
    test_each_path! { in "src/storage/testscripts/engine" as engine => test_goldenscript }

    // Also run BitCask-specific tests in src/storage/testscripts/bitcask.
    test_each_path! { in "src/storage/testscripts/bitcask" as scripts => test_goldenscript }

    /// Runs the goldenscript at `path` against a fresh BitCask runner,
    /// panicking if the script fails.
    fn test_goldenscript(path: &std::path::Path) {
        let mut runner = BitCaskRunner::new();
        goldenscript::run(&mut runner, path).expect("goldenscript failed")
    }

    /// Tests that exclusive locks are taken out on log files, erroring if held,
    /// and released when the database is closed.
    #[test]
    fn lock() -> Result<()> {
        // Hold on to the TempDir for the whole test. If it is only used as a
        // temporary to build the path, it is dropped (and the directory
        // removed) before the database is opened, and the files created
        // afterwards are never cleaned up.
        let dir = TempDir::with_prefix("toydb")?;
        let path = dir.path().join("bitcask");
        let engine = BitCask::new(path.clone()).expect("bitcask failed");

        // Opening another database with the same file should error.
        assert!(BitCask::new(path.clone()).is_err());

        // Opening another database after the current is closed works.
        drop(engine);
        assert!(BitCask::new(path).is_ok());
        Ok(())
    }

    /// Tests that a log whose final entry was only partially written can be
    /// opened, and that exactly the complete entries before it are retained.
    #[test]
    fn recovery() -> Result<()> {
        let dir = TempDir::with_prefix("toydb")?;
        let path = dir.path().join("complete");

        // Write a handful of entries to a reference log, recording the file
        // offset at which each entry ends.
        let mut log = Log::new(path.clone())?;
        let ends = vec![
            log.write_entry(b"deleted", Some(&[1, 2, 3]))?.end(),
            log.write_entry(b"deleted", None)?.end(),
            log.write_entry(&[], Some(&[]))?.end(),
            log.write_entry(b"key", Some(&[1, 2, 3, 4, 5]))?.end(),
        ];
        drop(log);

        // For every possible length, copy the reference log, cut the copy
        // down to that length, and check that opening it yields the entries
        // that were fully contained in the remaining prefix.
        let truncpath = dir.path().join("truncated");
        let size = std::fs::metadata(&path)?.len();
        for pos in 0..=size {
            std::fs::copy(&path, &truncpath)?;
            std::fs::OpenOptions::new().write(true).open(&truncpath)?.set_len(pos)?;

            let mut expect = vec![];
            // "deleted" is only visible between its write and its tombstone.
            if (ends[0]..ends[1]).contains(&pos) {
                expect.push((b"deleted".to_vec(), vec![1, 2, 3]));
            }
            if pos >= ends[2] {
                expect.push((b"".to_vec(), vec![]));
            }
            if pos >= ends[3] {
                expect.push((b"key".to_vec(), vec![1, 2, 3, 4, 5]));
            }

            let mut engine = BitCask::new(truncpath.clone())?;
            let actual = engine.scan(..).collect::<Result<Vec<_>>>()?;
            assert_eq!(expect, actual);
        }
        Ok(())
    }

    /// Tests key/value sizes up to 64 MB.
    #[test]
    fn point_ops_sizes() -> Result<()> {
        // Keep the TempDir alive until the end of the test, so the directory
        // (and the large log file written below) is removed when it is
        // dropped. The engine is declared afterwards and thus dropped first.
        let dir = TempDir::with_prefix("toydb")?;
        let mut engine = BitCask::new(dir.path().join("bitcask")).expect("bitcask failed");

        // Generate keys/values for increasing powers of two.
        for size in (1..=26).map(|i| 1 << i) {
            let value = vec![b'x'; size];
            let key = value.as_slice();

            assert_eq!(engine.get(key)?, None);
            engine.set(key, value.clone())?;
            assert_eq!(engine.get(key)?.as_ref(), Some(&value));
            engine.delete(key)?;
            assert_eq!(engine.get(key)?, None);
        }
        Ok(())
    }

    /// A BitCask-specific goldenscript runner, which dispatches through to the
    /// standard Engine runner.
    struct BitCaskRunner {
        /// The standard engine runner wrapping the BitCask engine under test.
        /// Commands that are not BitCask-specific are forwarded to it.
        inner: Runner<BitCask>,
        /// The temporary directory that holds the database file, as well as
        /// the placeholder "empty" database used while reopening.
        /// NOTE(review): presumably also held so the directory outlives the
        /// engine -- confirm against the tempfile crate's drop semantics.
        tempdir: TempDir,
    }

    impl goldenscript::Runner for BitCaskRunner {
        /// Executes a goldenscript command. BitCask-specific commands are
        /// handled here; everything else is delegated to the inner runner.
        fn run(&mut self, command: &goldenscript::Command) -> StdResult<String, Box<dyn StdError>> {
            let mut output = String::new();
            match command.name.as_str() {
                // compact
                //
                // Compacts the BitCask entry log.
                "compact" => {
                    command.consume_args().reject_rest()?;
                    self.inner.engine.compact()?;
                }

                // dump
                //
                // Dumps the full BitCask entry log.
                "dump" => {
                    command.consume_args().reject_rest()?;
                    self.dump(&mut output)?;
                }

                // reopen [compact_fraction=FLOAT]
                //
                // Closes and reopens the BitCask database. If compact_fraction
                // is given, it is passed as the garbage fraction beyond which
                // the log should be auto-compacted on open.
                "reopen" => {
                    let mut args = command.consume_args();
                    let compact_fraction = args.lookup_parse("compact_fraction")?;
                    args.reject_rest()?;

                    // The file is only closed once the database is dropped, so
                    // swap in a throwaway empty engine first, and only then
                    // open the original file again.
                    let path = self.inner.engine.log.path.clone();
                    let placeholder = self.tempdir.path().join("empty");
                    self.inner.engine = BitCask::new(placeholder)?;
                    self.inner.engine = match compact_fraction {
                        Some(garbage_fraction) => {
                            BitCask::new_maybe_compact(path, garbage_fraction, 0)?
                        }
                        None => BitCask::new(path)?,
                    };
                }

                // Anything else is a standard engine command.
                _ => return self.inner.run(command),
            }
            Ok(output)
        }
    }

    impl BitCaskRunner {
        /// Creates a runner with a new BitCask database stored in a fresh
        /// temporary directory.
        fn new() -> Self {
            let tempdir = TempDir::with_prefix("toydb").expect("tempdir failed");
            let path = tempdir.path().join("bitcask");
            let inner = Runner::new(BitCask::new(path).expect("bitcask failed"));
            Self { inner, tempdir }
        }

        /// Writes a human-readable listing of every entry in the BitCask log
        /// to `output`: the entry index and offset, the raw length fields, and
        /// the key and value (or tombstone marker), each with a hex dump.
        fn dump(&mut self, output: &mut String) -> StdResult<(), Box<dyn StdError>> {
            let file = &mut self.inner.engine.log.file;
            let end = file.metadata()?.len();
            let mut reader = BufReader::new(file);
            let mut offset = reader.seek(SeekFrom::Start(0))?;
            let mut word = [0; 4];
            let mut index = 0;

            while offset < end {
                // Separate consecutive entries with a divider line.
                if index > 0 {
                    writeln!(output, "--------")?;
                }
                let label = format!("{index}@{offset}");
                write!(output, "{label:<7}")?;

                // Key length header field.
                reader.read_exact(&mut word)?;
                let key_len = u32::from_be_bytes(word);
                write!(output, " keylen={key_len} [{}]", hex::encode(word))?;

                // Value length header field, where -1 marks a tombstone. Any
                // negative length means there are no value bytes to read.
                reader.read_exact(&mut word)?;
                let raw_value_len = i32::from_be_bytes(word);
                writeln!(output, " valuelen={raw_value_len} [{}]", hex::encode(word))?;
                let value_len = raw_value_len.max(0) as u32;

                // Key and value payloads.
                let mut key = vec![0; key_len as usize];
                reader.read_exact(&mut key)?;
                let mut value = vec![0; value_len as usize];
                reader.read_exact(&mut value)?;

                // Total entry size: two 4-byte length fields plus payloads.
                let size = 8 + u64::from(key_len) + u64::from(value_len);
                let size_label = format!("{size}b");
                let key_text = format::Raw::key(&key);
                let payload = if raw_value_len == -1 {
                    "tombstone".to_string()
                } else {
                    format!("value={} [{}]", format::Raw::bytes(&value), hex::encode(&value))
                };
                writeln!(
                    output,
                    "{size_label:<7} key={key_text} [{}] {payload}",
                    hex::encode(&key)
                )?;

                offset += size;
                index += 1;
            }
            Ok(())
        }
    }
}


================================================
FILE: src/storage/engine.rs
================================================
use std::ops::{Bound, RangeBounds};

use serde::{Deserialize, Serialize};

use crate::encoding::keycode;
use crate::error::Result;

/// A key/value storage engine, which stores arbitrary byte strings. Keys are
/// maintained in lexicographical order, which allows for range scans. This is
/// needed e.g. to scan all rows in a specific SQL table (where all table rows
/// have a common key prefix), or to scan the tail of the Raft log (after a
/// given log entry index).
///
/// Keys should use the Keycode order-preserving encoding, see
/// [`crate::encoding::keycode`].
///
/// Writes are only guaranteed durable after calling [`Engine::flush()`].
///
/// For simplicity, this only supports a single user at a time, so all methods
/// (including reads) take a mutable reference. This isn't that big of a deal
/// since Raft execution is serial anyway.
///
/// The trait can be used as a trait object: the generic scan methods and the
/// associated iterator type require `Self: Sized`, and [`Engine::scan_dyn`]
/// is the boxed equivalent for dynamic dispatch.
pub trait Engine: Send {
    /// The iterator returned by [`Engine::scan`].
    type ScanIterator<'a>: ScanIterator + 'a
    where
        Self: Sized + 'a; // omit in trait objects, for dyn compatibility

    /// Deletes a key, or does nothing if it does not exist.
    fn delete(&mut self, key: &[u8]) -> Result<()>;

    /// Flushes any buffered data to disk.
    fn flush(&mut self) -> Result<()>;

    /// Gets a value for a key, if it exists. Returns `None` otherwise.
    fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>>;

    /// Iterates over an ordered range of key/value pairs. The iterator
    /// borrows the engine mutably for as long as it is alive.
    fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIterator<'_>
    where
        Self: Sized; // omit in trait objects, for dyn compatibility

    /// Like scan, but can be used from trait objects (with dynamic dispatch).
    /// The range is given as an explicit pair of bounds, and the iterator is
    /// returned boxed.
    fn scan_dyn(&mut self, range: (Bound<Vec<u8>>, Bound<Vec<u8>>)) -> Box<dyn ScanIterator + '_>;

    /// Iterates over all key/value pairs starting with the given prefix.
    ///
    /// The default implementation scans the key range produced by
    /// [`keycode::prefix_range`] for the prefix.
    fn scan_prefix(&mut self, prefix: &[u8]) -> Self::ScanIterator<'_>
    where
        Self: Sized, // omit in trait objects, for dyn compatibility
    {
        self.scan(keycode::prefix_range(prefix))
    }

    /// Sets a value for a key, replacing the existing value if any.
    fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()>;

    /// Returns the engine status.
    fn status(&mut self) -> Result<Status>;
}

/// A scan iterator over key/value pairs, returned by [`Engine::scan()`].
///
/// Each item is a fallible owned key/value pair. The iterator is
/// double-ended, so it can also be consumed in reverse order.
pub trait ScanIterator: DoubleEndedIterator<Item = Result<(Vec<u8>, Vec<u8>)>> {}

/// Blanket implementation for all iterators that can act as a scan iterator.
/// The trait has no methods of its own, so it acts as a named alias for the
/// iterator bound.
impl<I: DoubleEndedIterator<Item = Result<(Vec<u8>, Vec<u8>)>>> ScanIterator for I {}

/// Engine status.
///
/// The amount of on-disk garbage is derived from the difference between
/// `disk_size` and `live_disk_size`, see [`Status::garbage_disk_size`].
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Status {
    /// The name of the storage engine.
    pub name: String,
    /// The number of live keys in the engine.
    pub keys: u64,
    /// The logical size of live key/value pairs.
    pub size: u64,
    /// The on-disk size of all data, live and garbage.
    pub disk_size: u64,
    /// The on-disk size of live data, excluding garbage.
    pub live_disk_size: u64,
}

impl Status {
    /// The on-disk size of garbage data, i.e. the part of `disk_size` that is
    /// not live data.
    ///
    /// Saturates at zero if `live_disk_size` exceeds `disk_size`, rather than
    /// overflowing (which would panic in debug builds and wrap around to a
    /// huge value in release builds).
    pub fn garbage_disk_size(&self) -> u64 {
        self.disk_size.saturating_sub(self.live_disk_size)
    }

    /// The percentage (0.0 to 100.0) of the total on-disk size that is
    /// garbage. Returns 0.0 when there is no on-disk data at all, to avoid
    /// dividing by zero.
    pub fn garbage_disk_percent(&self) -> f64 {
        if self.disk_size == 0 {
            return 0.0;
        }
        self.garbage_disk_size() as f64 / self.disk_size as f64 * 100.0
    }
}

/// Test helpers for engines.
#[cfg(test)]
pub mod test {
    use std::error::Error as StdError;
    use std::fmt::Write as _;
    use std::ops::{Bound, RangeBounds};
    use std::result::Result as StdResult;

    use crossbeam::channel::Sender;
    use itertools::Itertools as _;
    use regex::Regex;

    use super::*;
    use crate::encoding::format::{self, Formatter as _};

    /// Goldenscript runner for engines. All engines use a common set of
    /// goldenscripts in src/storage/testscripts/engine, as well as their own
    /// engine-specific tests.
    pub struct Runner<E: Engine> {
        /// The engine that commands are executed against. It is public so
        /// that engine-specific runners wrapping this one can access the
        /// engine directly.
        pub engine: E,
    }

    impl<E: Engine> Runner<E> {
        /// Creates a runner that executes commands against the given engine.
        pub fn new(engine: E) -> Self {
            Self { engine }
        }
    }

    impl<E: Engine> goldenscript::Runner for Runner<E> {
        /// Executes a single engine command and returns its textual output.
        /// Keys, values, and range bounds are given in the string form
        /// understood by `decode_binary` and `parse_key_range`. Unknown
        /// command names yield an error.
        fn run(&mut self, command: &goldenscript::Command) -> StdResult<String, Box<dyn StdError>> {
            let mut output = String::new();
            match command.name.as_str() {
                // delete KEY
                "delete" => {
                    let mut args = command.consume_args();
                    let arg = args.next_pos().ok_or("key not given")?;
                    let key = decode_binary(&arg.value);
                    args.reject_rest()?;
                    self.engine.delete(&key)?;
                }

                // get KEY
                "get" => {
                    let mut args = command.consume_args();
                    let arg = args.next_pos().ok_or("key not given")?;
                    let key = decode_binary(&arg.value);
                    args.reject_rest()?;
                    let found = self.engine.get(&key)?;
                    writeln!(output, "{}", format::Raw::key_maybe_value(&key, found.as_deref()))?;
                }

                // scan [reverse=BOOL] RANGE
                "scan" => {
                    let mut args = command.consume_args();
                    let reverse = args.lookup_parse("reverse")?.unwrap_or(false);
                    // Scan everything if no range is given.
                    let spec = args.next_pos().map_or("..", |arg| arg.value.as_str());
                    let range = parse_key_range(spec)?;
                    args.reject_rest()?;

                    // Collect all items before emitting any output.
                    let scan = self.engine.scan(range);
                    let items: Vec<_> =
                        if reverse { scan.rev().try_collect()? } else { scan.try_collect()? };
                    for (key, value) in items {
                        writeln!(output, "{}", format::Raw::key_value(&key, &value))?;
                    }
                }

                // scan_prefix PREFIX
                "scan_prefix" => {
                    let mut args = command.consume_args();
                    let arg = args.next_pos().ok_or("prefix not given")?;
                    let prefix = decode_binary(&arg.value);
                    args.reject_rest()?;
                    for item in self.engine.scan_prefix(&prefix) {
                        let (key, value) = item?;
                        writeln!(output, "{}", format::Raw::key_value(&key, &value))?;
                    }
                }

                // set KEY=VALUE
                "set" => {
                    let mut args = command.consume_args();
                    let pair = args.next_key().ok_or("key=value not given")?.clone();
                    let key = decode_binary(&pair.key.unwrap());
                    let value = decode_binary(&pair.value);
                    args.reject_rest()?;
                    self.engine.set(&key, value)?;
                }

                // status
                "status" => {
                    command.consume_args().reject_rest()?;
                    let status = self.engine.status()?;
                    writeln!(output, "{status:#?}")?;
                }

                name => return Err(format!("invalid command {name}").into()),
            }
            Ok(output)
        }
    }

    /// Converts a Unicode string into a raw byte vector, mapping the code
    /// points U+0080 through U+00FF directly onto the single bytes 0x80
    /// through 0xff. All other characters are emitted as their UTF-8 encoding.
    ///
    /// This makes it possible to write e.g. \xff in an input string literal
    /// and obtain a single 0xff byte, where the regular UTF-8 encoding of
    /// U+00FF would be the two bytes 0xc3bf. In effect that range is treated
    /// as ISO-8859-1 rather than UTF-8, which allows every u8 value to be
    /// expressed precisely.
    pub fn decode_binary(s: &str) -> Vec<u8> {
        let mut bytes = Vec::new();
        for c in s.chars() {
            // The Unicode code point of the character, not its UTF-8 bytes.
            let code = u32::from(c);
            if (0x80..=0xff).contains(&code) {
                bytes.push(code as u8);
            } else {
                let mut utf8 = [0; 4];
                bytes.extend_from_slice(c.encode_utf8(&mut utf8).as_bytes());
            }
        }
        bytes
    }

    /// Parses a binary key range written in Rust range syntax, e.g. `a..b`,
    /// `a..=b`, `a..`, `..b`, or `..`. The bounds are decoded with
    /// `decode_binary`. A given start bound is always inclusive; a given end
    /// bound is inclusive only when preceded by `=`. Missing bounds are
    /// unbounded. Returns an error if the string does not look like a range.
    pub fn parse_key_range(s: &str) -> StdResult<impl RangeBounds<Vec<u8>>, Box<dyn StdError>> {
        let re = Regex::new(r"^(\S+)?\.\.(=)?(\S+)?").expect("invalid regex");
        let groups = re.captures(s).ok_or_else(|| format!("invalid range {s}"))?;

        // Group 1 is the start key, group 2 the optional `=`, and group 3 the
        // end key.
        let start = match groups.get(1) {
            Some(key) => Bound::Included(decode_binary(key.as_str())),
            None => Bound::Unbounded,
        };
        let end = match (groups.get(3), groups.get(2)) {
            (Some(key), Some(_)) => Bound::Included(decode_binary(key.as_str())),
            (Some(key), None) => Bound::Excluded(decode_binary(key.as_str())),
            (None, _) => Bound::Unbounded,
        };
        Ok((start, end))
    }

    /// Wraps another engine and emits write events to the given channel.
    /// Events are sent for deletes, flushes, and sets; reads are passed
    /// through to the wrapped engine without emitting anything.
    pub struct Emit<E: Engine> {
        /// The wrapped engine.
        inner: E,
        /// Sends operation events.
        tx: Sender<Operation>,
    }

    /// An engine operation emitted by the Emit engine.
    pub enum Operation {
        /// A key was deleted.
        Delete { key: Vec<u8> },
        /// The engine was flushed.
        Flush,
        /// A key was set to the given value.
        Set { key: Vec<u8>, value: Vec<u8> },
    }

    impl<E: Engine> Emit<E> {
        /// Wraps `inner`, emitting its write operations on `tx`.
        pub fn new(inner: E, tx: Sender<Operation>) -> Self {
            Self { inner, tx }
        }
    }

    impl<E: Engine> Engine for Emit<E> {
        type ScanIterator<'a>
            = E::ScanIterator<'a>
        where
            E: 'a;

        /// Deletes the key in the wrapped engine, then emits a Delete event.
        fn delete(&mut self, key: &[u8]) -> Result<()> {
            self.inner.delete(key)?;
            let event = Operation::Delete { key: key.to_vec() };
            Ok(self.tx.send(event)?)
        }

        /// Flushes the wrapped engine, then emits a Flush event.
        fn flush(&mut self) -> Result<()> {
            self.inner.flush()?;
            Ok(self.tx.send(Operation::Flush)?)
        }

        /// Reads from the wrapped engine. No event is emitted.
        fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
            self.inner.get(key)
        }

        /// Scans the wrapped engine. No event is emitted.
        fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIterator<'_> {
            self.inner.scan(range)
        }

        /// Boxed variant of scan, for use via trait objects.
        fn scan_dyn(
            &mut self,
            range: (Bound<Vec<u8>>, Bound<Vec<u8>>),
        ) -> Box<dyn ScanIterator + '_> {
            let scan = self.scan(range);
            Box::new(scan)
        }

        /// Sets the key in the wrapped engine, then emits a Set event
        /// carrying the same key and value.
        fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
            let event = Operation::Set { key: key.to_vec(), value: value.clone() };
            self.inner.set(key, value)?;
            Ok(self.tx.send(event)?)
        }

        /// Returns the status of the wrapped engine.
        fn status(&mut self) -> Result<Status> {
            self.inner.status()
        }
    }

    /// An engine that wraps two others and mirrors operations across them,
    /// panicking if they produce different results. Engine implementations
    /// should not have any observable differences in behavior.
    pub struct Mirror<A: Engine, B: Engine> {
        /// The first engine. Its results are the ones returned to callers.
        pub a: A,
        /// The second engine, whose results are compared against the first.
        pub b: B,
    }

    impl<A: Engine, B: Engine> Mirror<A, B> {
        /// Creates a mirror engine across the two given engines.
        pub fn new(a: A, b: B) -> Self {
            Self { a, b }
        }
    }

    impl<A: Engine, B: Engine> Engine for Mirror<A, B> {
        type ScanIterator<'a>
            = MirrorIterator<'a, A, B>
        where
            Self: Sized,
            A: 'a,
            B: 'a;

        /// Deletes the key from both engines, first `a` and then `b`.
        fn delete(&mut self, key: &[u8]) -> Result<()> {
            self.a.delete(key)?;
            self.b.delete(key)?;
            Ok(())
        }

        /// Flushes both engines, first `a` and then `b`.
        fn flush(&mut self) -> Result<()> {
            self.a.flush()?;
            self.b.flush()?;
            Ok(())
        }

        /// Reads the key from both engines, panicking if the results differ.
        fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
            let value = self.a.get(key)?;
            assert_eq!(value, self.b.get(key)?);
            Ok(value)
        }

        /// Scans both engines over the same range. The returned iterator
        /// compares the two scans item by item.
        fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIterator<'_>
        where
            Self: Sized,
        {
            // The range is consumed by the second scan, so the first scan
            // gets an owned copy of its bounds.
            let bounds = (range.start_bound().cloned(), range.end_bound().cloned());
            MirrorIterator { a: self.a.scan(bounds), b: self.b.scan(range) }
        }

        /// Boxed variant of scan, for use via trait objects.
        fn scan_dyn(
            &mut self,
            range: (Bound<Vec<u8>>, Bound<Vec<u8>>),
        ) -> Box<dyn ScanIterator + '_> {
            let iter = MirrorIterator::<A, B> {
                a: self.a.scan(range.clone()),
                b: self.b.scan(range),
            };
            Box::new(iter)
        }

        /// Sets the key in both engines, first `a` and then `b`.
        fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
            self.a.set(key, value.clone())?;
            self.b.set(key, value)?;
            Ok(())
        }

        /// Returns the status of `a`, after checking that the comparable
        /// fields agree with the status of `b`.
        fn status(&mut self) -> Result<Status> {
            let status = self.a.status()?;
            let other = self.b.status()?;
            // Only the key count and logical size are comparable.
            assert_eq!(status.keys, other.keys);
            assert_eq!(status.size, other.size);
            Ok(status)
        }
    }

    /// The scan iterator of a Mirror engine. It advances the scans of both
    /// engines in lockstep and panics if they ever yield different items.
    pub struct MirrorIterator<'a, A: Engine + 'a, B: Engine + 'a> {
        a: A::ScanIterator<'a>,
        b: B::ScanIterator<'a>,
    }

    impl<A: Engine, B: Engine> Iterator for MirrorIterator<'_, A, B> {
        type Item = Result<(Vec<u8>, Vec<u8>)>;

        /// Advances both scans from the front, returning the item from `a`.
        fn next(&mut self) -> Option<Self::Item> {
            let (a, b) = (self.a.next(), self.b.next());
            assert_eq!(a, b);
            a
        }
    }

    impl<A: Engine, B: Engine> DoubleEndedIterator for MirrorIterator<'_, A, B> {
        /// Advances both scans from the back, returning the item from `a`.
        fn next_back(&mut self) -> Option<Self::Item> {
            let (a, b) = (self.a.next_back(), self.b.next_back());
            assert_eq!(a, b);
            a
        }
    }
}


================================================
FILE: src/storage/memory.rs
================================================
use std::collections::BTreeMap;
use std::collections::btree_map::Range;
use std::ops::{Bound, RangeBounds};

use super::{Engine, Status};
use crate::error::Result;

/// An in-memory key-value storage engine using the Rust standard library's
/// B-tree implementation. Data is not persisted. Primarily for testing.
///
/// The single field is the ordered map holding all key/value pairs.
#[derive(Default)]
pub struct Memory(BTreeMap<Vec<u8>, Vec<u8>>);

impl Memory {
    /// Creates a new Memory key-value storage engine. The engine is
    /// initially empty.
    pub fn new() -> Self {
        Self::default()
    }
}

impl Engine for Memory {
    type ScanIterator<'a> = ScanIterator<'a>;

    /// Removes the key from the map, if present.
    fn delete(&mut self, key: &[u8]) -> Result<()> {
        let _previous = self.0.remove(key);
        Ok(())
    }

    /// Does nothing, since there is nothing to persist.
    fn flush(&mut self) -> Result<()> {
        Ok(())
    }

    /// Returns a copy of the value stored for the key, if any.
    fn get(&mut self, key: &[u8]) -> Result<Option<Vec<u8>>> {
        let found = self.0.get(key);
        Ok(found.cloned())
    }

    /// Iterates over the map entries within the given key range.
    fn scan(&mut self, range: impl RangeBounds<Vec<u8>>) -> Self::ScanIterator<'_> {
        let entries = self.0.range(range);
        ScanIterator(entries)
    }

    /// Boxed variant of scan, for use via trait objects.
    fn scan_dyn(
        &mut self,
        range: (Bound<Vec<u8>>, Bound<Vec<u8>>),
    ) -> Box<dyn super::ScanIterator + '_> {
        let scan = self.scan(range);
        Box::new(scan)
    }

    /// Stores the value under the key, replacing any previous value.
    fn set(&mut self, key: &[u8], value: Vec<u8>) -> Result<()> {
        let _previous = self.0.insert(key.to_vec(), value);
        Ok(())
    }

    /// Reports the number of keys and the combined byte length of all keys
    /// and values. The disk sizes are always zero.
    fn status(&mut self) -> Result<Status> {
        let keys = self.0.len() as u64;
        let size = self.0.iter().map(|(key, value)| (key.len() + value.len()) as u64).sum();
        Ok(Status {
            name: "memory".to_string(),
            keys,
            size,
            disk_size: 0,
            live_disk_size: 0,
        })
    }
}

/// The scan iterator of the Memory engine. It wraps a B-tree range iterator
/// and yields owned copies of the borrowed keys and values.
pub struct ScanIterator<'a>(Range<'a, Vec<u8>, Vec<u8>>);

impl Iterator for ScanIterator<'_> {
    type Item = Result<(Vec<u8>, Vec<u8>)>;

    /// Yields a copy of the next entry from the front of the range.
    fn next(&mut self) -> Option<Self::Item> {
        let (key, value) = self.0.next()?;
        Some(Ok((key.clone(), value.clone())))
    }
}

impl DoubleEndedIterator for ScanIterator<'_> {
    /// Yields a copy of the next entry from the back of the range.
    fn next_back(&mut self) -> Option<Self::Item> {
        let (key, value) = self.0.next_back()?;
        Some(Ok((key.clone(), value.clone())))
    }
}

/// Most storage tests are Goldenscripts under src/storage/testscripts.
#[cfg(test)]
mod tests {
    use std::path::Path;

    use test_each_file::test_each_path;

    use super::super::engine::test::Runner;
    use super::*;

    // Run common goldenscript tests in src/storage/testscripts/engine.
    test_each_path! { in "src/storage/testscripts/engine" as engine => test_goldenscript }

    // Also run Memory-specific tests in src/storage/testscripts/memory.
    test_each_path! { in "src/storage/testscripts/memory" as scripts => test_goldenscript }

    /// Runs the goldenscript at `path` against a fresh Memory engine,
    /// panicking if the script fails.
    fn test_goldenscript(path: &Path) {
        let mut runner = Runner::new(Memory::new());
        goldenscript::run(&mut runner, path).expect("goldenscript failed")
    }
}


================================================
FILE: src/storage/mod.rs
================================================
//! Key/value storage engines, including an MVCC transaction layer. For details,
//! see the [`engine`], [`bitcask`], and [`mvcc`] module documentation.

pub mod bitcask;
pub mod engine;
pub mod memory;
pub mod mvcc;

pub use bitcask::BitCask;
pub use engine::{Engine, ScanIterator, Status};
pub use memory::Memory;


================================================
FILE: src/storage/mvcc.rs
================================================
//! This module implements MVCC (Multi-Version Concurrency Control), a widely
//! used method for ACID transactions and concurrency control. It allows
//! multiple concurrent transactions to access and modify the same dataset,
//! isolates them from each other, detects and handles conflicts, and commits
//! their writes atomically as a single unit. It uses an underlying storage
//! engine to store raw keys and values.
//!
//! VERSIONS
//! ========
//!
//! MVCC handles concurrency control by managing multiple historical versions of
//! keys, identified by a timestamp. Every write adds a new version at a higher
//! timestamp, with deletes having a special tombstone value. For example, the
//! keys a,b,c,d may have the following values at various logical timestamps (x
//! is tombstone):
//!
//! Time
//! 5
//! 4  a4          
//! 3      b3      x
//! 2            
//! 1  a1      c1  d1
//!    a   b   c   d   Keys
//!
//! A transaction t2 that started at T=2 will see the values a=a1, c=c1, d=d1. A
//! different transaction t5 running at T=5 will see a=a4, b=b3, c=c1.
//!
//! toyDB uses logical timestamps with a sequence number stored in
//! Key::NextVersion. Each new read-write transaction takes its timestamp from
//! the current value of Key::NextVersion and then increments the value for the
//! next transaction.
//!
//! ISOLATION
//! =========
//!
//! MVCC provides an isolation level called snapshot isolation. Briefly,
//! transactions see a consistent snapshot of the database state as of their
//! start time. Writes made by concurrent or subsequent transactions are never
//! visible to it. If two concurrent transactions write to the same key they
//! will conflict and one of them must retry. A transaction's writes become
//! atomically visible to subsequent transactions only when they commit, and are
//! rolled back on failure. Read-only transactions never conflict with other
//! transactions.
//!
//! Transactions write new versions at their timestamp, storing them as
//! Key::Version(key, version) => value. If a transaction writes to a key and
//! finds a newer version, it returns an error and the client must retry.
//!
//! Active (uncommitted) read-write transactions record their version in the
//! active set, stored as Key::TxnActive(version). When new transactions begin, they
//! take a snapshot of this active set, and any key versions that belong to a
//! transaction in the active set are considered invisible (to anyone except that
//! transaction itself). Writes to keys that already have a past version in the
//! active set will also return an error.
//!
//! To commit, a transaction simply deletes its record in the active set. This
//! will immediately (and, crucially, atomically) make all of its writes visible
//! to subsequent transactions, but not ongoing ones. If the transaction is
//! cancelled and rolled back, it maintains a record of all keys it wrote as
//! Key::TxnWrite(version, key), so that it can find the corresponding versions
//! and delete them before removing itself from the active set.
//!
//! Consider the following example, where we have two ongoing transactions at
//! time T=2 and T=5, with some writes that are not yet committed marked in
//! parentheses.
//!
//! Active set: [2, 5]
//!
//! Time
//! 5 (a5)
//! 4  a4          
//! 3      b3      x
//! 2         (x)     (e2)
//! 1  a1      c1  d1
//!    a   b   c   d   e   Keys
//!
//! Here, t2 will see a=a1, d=d1, e=e2 (it sees its own writes). t5 will see
//! a=a5, b=b3, c=c1. t2 does not see any newer versions, and t5 does not see
//! the tombstone at c@2 nor the value e=e2, because version=2 is in its active
//! set.
//!
//! If t2 tries to write b=b2, it receives an error and must retry, because a
//! newer version exists. Similarly, if t5 tries to write e=e5, it receives an
//! error and must retry, because the version e=e2 is in its active set.
//!
//! To commit, t2 can remove itself from the active set. A new transaction t6
//! starting after the commit will then see c as deleted and e=e2. t5 will still
//! not see any of t2's writes, because it's still in its local snapshot of the
//! active set at the time it began.
//!
//! READ-ONLY AND TIME TRAVEL QUERIES
//! =================================
//!
//! Since MVCC stores historical versions, it can trivially support time travel
//! queries where a transaction reads at a past timestamp and has a consistent
//! view of the database at that time.
//!
//! This is done by a transaction simply using a past version, as if it had
//! started far in the past, ignoring newer versions like any other transaction.
//! This transaction cannot write, as it does not have a unique timestamp (the
//! timestamp is owned by the read-write transaction that originally used it).
//!
//! The only wrinkle is that the time-travel query must also know what the active
//! set was at that version. Otherwise, it may see past transactions that committed
//! after that time, which were not visible to the original transaction that wrote
//! at that version. Similarly, if a time-travel query reads at a version that is
//! still active, it should not see its in-progress writes, and after it commits
//! a different time-travel query should not see those writes either, to maintain
//! version consistency.
//!
//! To achieve this, every read-write transaction stores its active set snapshot
//! in the storage engine as well, as Key::TxnActiveSnapshot, such that later
//! time-travel queries can restore its original snapshot. Furthermore, a
//! time-travel query can only see versions below the snapshot version, otherwise
//! it could see spurious in-progress or since-committed versions.
//!
//! In the following example, a time-travel query at version=3 would see a=a1,
//! c=c1, d=d1.
//!
//! Time
//! 5
//! 4  a4          
//! 3      b3      x
//! 2            
//! 1  a1      c1  d1
//!    a   b   c   d   Keys
//!
//! Read-only queries work similarly to time-travel queries, with one exception:
//! they read at the next (current) version, i.e. Key::NextVersion, and use the
//! current active set, storing the snapshot in memory only. Read-only queries
//! do not increment the version sequence number in Key::NextVersion.
//!
//! GARBAGE COLLECTION
//! ==================
//!
//! Normally, old versions would be garbage collected regularly, when they are
//! no longer needed by active transactions or time-travel queries. However,
//! toyDB does not implement garbage collection, instead keeping all history
//! forever, both out of laziness and also because it allows unlimited time
//! travel queries (it's a feature, not a bug!).

use std::borrow::Cow;
use std::collections::{BTreeSet, VecDeque};
use std::ops::{Bound, RangeBounds};
use std::sync::{Arc, Mutex, MutexGuard};

use itertools::Itertools as _;
use serde::{Deserialize, Serialize};

use super::engine::{self, Engine};
use crate::encoding::{self, Key as _, Value as _, bincode, keycode};
use crate::error::{Error, Result};
use crate::{errdata, errinput};

/// An MVCC version represents a logical timestamp. Each version belongs to a
/// separate read-write transaction. The next available version is stored under
/// Key::NextVersion, and is incremented when a new read-write transaction
/// begins. Read-only transactions read the counter without incrementing it.
pub type Version = u64;

// Versions are stored as engine values, e.g. under Key::NextVersion.
impl encoding::Value for Version {}

/// MVCC keys, using the Keycode encoding which preserves the ordering and
/// grouping of keys.
///
/// Cow byte slices allow encoding borrowed values and decoding owned values.
///
/// NB: the order of the variants is significant. KeyPrefix must mirror it
/// (including the variant index), and Transaction::scan() uses the Unversioned
/// prefix as the exclusive upper bound of the Version keyspace.
#[derive(Debug, Deserialize, Serialize)]
pub enum Key<'a> {
    /// The next available version.
    NextVersion,
    /// Active (uncommitted) transactions by version. The value is empty, only
    /// the presence of the key matters.
    TxnActive(Version),
    /// A snapshot of the active set at each version. Only written for
    /// versions where the active set is non-empty (excluding itself).
    TxnActiveSnapshot(Version),
    /// Keeps track of all keys written to by an active transaction (identified
    /// by its version), in case it needs to roll back. Contains the user key,
    /// not the encoded engine key.
    TxnWrite(
        Version,
        #[serde(with = "serde_bytes")]
        #[serde(borrow)]
        Cow<'a, [u8]>,
    ),
    /// A versioned key/value pair. The user key comes first, such that all
    /// versions of a key are grouped together and ordered by version.
    Version(
        #[serde(with = "serde_bytes")]
        #[serde(borrow)]
        Cow<'a, [u8]>,
        Version,
    ),
    /// Unversioned non-transactional key/value pairs, mostly used for metadata.
    /// These exist separately from versioned keys, i.e. the unversioned key
    /// "foo" is entirely independent of the versioned key "foo@7".
    Unversioned(
        #[serde(with = "serde_bytes")]
        #[serde(borrow)]
        Cow<'a, [u8]>,
    ),
}

// MVCC keys are encoded as engine keys.
impl<'a> encoding::Key<'a> for Key<'a> {}

/// MVCC key prefixes, for prefix scans. These must match the keys above,
/// including the enum variant index. Each variant omits the trailing field(s)
/// of the corresponding Key variant, such that its encoding is a prefix of the
/// encoded key.
#[derive(Debug, Deserialize, Serialize)]
enum KeyPrefix<'a> {
    /// Prefix of Key::NextVersion (the entire key).
    NextVersion,
    /// Prefix of all Key::TxnActive keys, i.e. the entire active set.
    TxnActive,
    /// Prefix of all Key::TxnActiveSnapshot keys.
    TxnActiveSnapshot,
    /// Prefix of all Key::TxnWrite keys for the given version.
    TxnWrite(Version),
    /// Prefix of all Key::Version keys (i.e. all versions) for the given
    /// user key.
    Version(
        #[serde(with = "serde_bytes")]
        #[serde(borrow)]
        Cow<'a, [u8]>,
    ),
    /// Prefix of all Key::Unversioned keys.
    Unversioned,
}

// Prefixes are encoded the same way as keys.
impl<'a> encoding::Key<'a> for KeyPrefix<'a> {}

/// An MVCC-based transactional key-value engine. It wraps an underlying storage
/// engine that's used for raw key/value storage.
///
/// While it supports any number of concurrent transactions, individual read or
/// write operations are executed sequentially, serialized via a mutex. There
/// are two reasons for this: the storage engine itself is not thread-safe,
/// requiring serialized access, and the Raft state machine that manages the
/// MVCC engine applies commands one at a time from the Raft log, which will
/// serialize them anyway.
pub struct MVCC<E: Engine> {
    /// The underlying storage engine. Every transaction created by this MVCC
    /// engine receives a clone of the Arc, and locks the mutex for each
    /// individual operation.
    pub engine: Arc<Mutex<E>>,
}

impl<E: Engine> MVCC<E> {
    /// Creates a new MVCC engine with the given storage engine.
    pub fn new(engine: E) -> Self {
        Self { engine: Arc::new(Mutex::new(engine)) }
    }

    /// Begins a new read-write transaction.
    pub fn begin(&self) -> Result<Transaction<E>> {
        Transaction::begin(self.engine.clone())
    }

    /// Begins a new read-only transaction at the latest version.
    pub fn begin_read_only(&self) -> Result<Transaction<E>> {
        Transaction::begin_read_only(self.engine.clone(), None)
    }

    /// Begins a new read-only transaction as of the given version. Errors if
    /// the version has not been allocated yet.
    pub fn begin_as_of(&self, version: Version) -> Result<Transaction<E>> {
        Transaction::begin_read_only(self.engine.clone(), Some(version))
    }

    /// Resumes a transaction from the given transaction state. Errors if a
    /// read-write transaction is no longer active.
    pub fn resume(&self, state: TransactionState) -> Result<Transaction<E>> {
        Transaction::resume(self.engine.clone(), state)
    }

    /// Fetches the value of an unversioned key.
    pub fn get_unversioned(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
        self.engine.lock()?.get(&Key::Unversioned(key.into()).encode())
    }

    /// Sets the value of an unversioned key.
    pub fn set_unversioned(&self, key: &[u8], value: Vec<u8>) -> Result<()> {
        self.engine.lock()?.set(&Key::Unversioned(key.into()).encode(), value)
    }

    /// Returns the status of the MVCC and storage engines.
    ///
    /// Errors if Key::NextVersion can't be read or decoded, if scanning the
    /// active set fails, or if the storage engine can't report its status.
    pub fn status(&self) -> Result<Status> {
        let mut engine = self.engine.lock()?;
        // NextVersion is the next version to allocate, so the number of
        // allocated versions is one less (or 0 if none were allocated yet).
        let versions = match engine.get(&Key::NextVersion.encode())? {
            Some(ref v) => Version::decode(v)? - 1,
            None => 0,
        };
        // Count the active set by walking the scan, such that a failed read is
        // returned as an error instead of being counted as an active
        // transaction. The scan is scoped to release its borrow of the engine
        // before fetching the engine status below.
        let active_txns = {
            let mut scan = engine.scan_prefix(&KeyPrefix::TxnActive.encode());
            let mut count: u64 = 0;
            while scan.next().transpose()?.is_some() {
                count += 1;
            }
            count
        };
        Ok(Status { versions, active_txns, storage: engine.status()? })
    }
}

/// MVCC engine status, as returned by MVCC::status().
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Status {
    /// The total number of MVCC versions (i.e. read-write transactions)
    /// allocated so far. Derived from Key::NextVersion, so it counts every
    /// transaction that began, regardless of how it ended.
    pub versions: u64,
    /// Number of currently active transactions, i.e. the number of
    /// Key::TxnActive entries.
    pub active_txns: u64,
    /// The storage engine.
    pub storage: super::engine::Status,
}

// Allows the status to be encoded as a value.
impl encoding::Value for Status {}

/// An MVCC transaction. Created via MVCC::begin() and friends, or restored
/// from a TransactionState via MVCC::resume().
pub struct Transaction<E: Engine> {
    /// The underlying engine, shared by all transactions. It is locked for the
    /// duration of each individual operation.
    engine: Arc<Mutex<E>>,
    /// The transaction state.
    state: TransactionState,
}

/// A Transaction's state, which determines its write version and isolation. It
/// is separate from Transaction to allow it to be passed around independently
/// of the engine. There are two main motivations for this:
///
/// * It can be exported via Transaction.state(), (de)serialized, and later used
///   to instantiate a new functionally equivalent Transaction via
///   Transaction::resume(). This allows passing the transaction between the
///   storage engine and SQL engine (potentially running on a different node)
///   across the Raft state machine boundary.
///
/// * It can be borrowed independently of Engine, allowing references to it
///   in VisibleIterator, which would otherwise result in self-references.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TransactionState {
    /// The version this transaction is running at. Only one read-write
    /// transaction can run at a given version, since this identifies its
    /// writes. Read-only transactions only see versions below this one.
    pub version: Version,
    /// If true, the transaction is read only.
    pub read_only: bool,
    /// The set of concurrent active (uncommitted) transactions, as of the start
    /// of this transaction. Their writes should be invisible to this
    /// transaction even if they're writing at a lower version, since they're
    /// not committed yet. Uses a BTreeSet for test determinism.
    ///
    /// The set never contains the transaction's own version: it is captured
    /// before a read-write transaction registers itself as active.
    pub active: BTreeSet<Version>,
}

// Allows the state to be encoded as a value.
impl encoding::Value for TransactionState {}

impl TransactionState {
    /// Determines whether a write at the given version can be seen by this
    /// transaction.
    ///
    /// A version is hidden if it lies in the transaction's future, or if it
    /// was written by a transaction that was still active (uncommitted) when
    /// this transaction began.
    ///
    /// A read-write transaction sees versions up to and including its own,
    /// i.e. it sees its own writes.
    ///
    /// A read-only transaction only sees versions strictly below its own. This
    /// keeps time-travel queries stable: they return the same result both
    /// before and after the read-write transaction at that version commits.
    /// See the module documentation for details.
    fn is_visible(&self, version: Version) -> bool {
        // The upper bound is exclusive for read-only transactions and
        // inclusive for read-write transactions.
        let within_bound =
            if self.read_only { version < self.version } else { version <= self.version };
        within_bound && !self.active.contains(&version)
    }
}

/// Wraps an owned transaction state in a Cow.
impl From<TransactionState> for Cow<'_, TransactionState> {
    fn from(state: TransactionState) -> Self {
        Self::Owned(state)
    }
}

/// Wraps a borrowed transaction state in a Cow, without cloning it.
impl<'a> From<&'a TransactionState> for Cow<'a, TransactionState> {
    fn from(state: &'a TransactionState) -> Self {
        Self::Borrowed(state)
    }
}

impl<E: Engine> Transaction<E> {
    /// Starts a read-write transaction. A fresh version is allocated for its
    /// writes, the set of transactions active at that point is captured (and
    /// persisted for time-travel queries when non-empty), and the new version
    /// is then registered in the active set.
    fn begin(engine: Arc<Mutex<E>>) -> Result<Self> {
        let mut session = engine.lock()?;

        // Claim the next version and bump the counter. A missing counter
        // means no version has been allocated yet, so start at 1.
        let version = session
            .get(&Key::NextVersion.encode())?
            .map(|bytes| Version::decode(&bytes))
            .transpose()?
            .unwrap_or(1);
        session.set(&Key::NextVersion.encode(), (version + 1).encode())?;

        // Capture the active set before joining it, so that it never contains
        // our own version. Only non-empty snapshots are persisted.
        let active = Self::scan_active(&mut session)?;
        if !active.is_empty() {
            session.set(&Key::TxnActiveSnapshot(version).encode(), active.encode())?;
        }
        session.set(&Key::TxnActive(version).encode(), vec![])?;
        drop(session);

        let state = TransactionState { version, read_only: false, active };
        Ok(Self { engine, state })
    }

    /// Starts a read-only transaction. Without as_of it reads at the latest
    /// version, using the current active set. With as_of it reads the state as
    /// of the beginning of that version (excluding writes at that version),
    /// i.e. the state that the read-write transaction at that version saw
    /// when it began. Errors if as_of has not been allocated yet.
    fn begin_read_only(engine: Arc<Mutex<E>>, as_of: Option<Version>) -> Result<Self> {
        let mut session = engine.lock()?;

        // The next version to be allocated. Not incremented here, since
        // read-only transactions don't write.
        let latest = match session.get(&Key::NextVersion.encode())? {
            Some(bytes) => Version::decode(&bytes)?,
            None => 1,
        };

        let (version, active) = match as_of {
            // Versions at or beyond the counter have not been handed out.
            Some(as_of) if as_of >= latest => {
                return errinput!("version {as_of} does not exist");
            }
            // Time travel: restore the active set persisted when that version
            // began. No snapshot means the active set was empty.
            Some(as_of) => {
                let snapshot = match session.get(&Key::TxnActiveSnapshot(as_of).encode())? {
                    Some(bytes) => BTreeSet::<Version>::decode(&bytes)?,
                    None => BTreeSet::new(),
                };
                (as_of, snapshot)
            }
            // Latest version: use the current, real-time active set.
            None => (latest, Self::scan_active(&mut session)?),
        };
        drop(session);

        Ok(Self { engine, state: TransactionState { version, read_only: true, active } })
    }

    /// Restores a transaction from a previously exported state. Errors if the
    /// state belongs to a read-write transaction that is no longer active.
    fn resume(engine: Arc<Mutex<E>>, s: TransactionState) -> Result<Self> {
        // Only read-write transactions are checked against the active set:
        // they must still be active before they can make further writes.
        if !s.read_only {
            let marker = engine.lock()?.get(&Key::TxnActive(s.version).encode())?;
            if marker.is_none() {
                return errinput!("no active transaction at version {}", s.version);
            }
        }
        Ok(Self { engine, state: s })
    }

    /// Collects the versions of all currently active transactions, by scanning
    /// the Key::TxnActive entries. Errors if the scan yields any other key.
    fn scan_active(session: &mut MutexGuard<E>) -> Result<BTreeSet<Version>> {
        let mut active = BTreeSet::new();
        for item in session.scan_prefix(&KeyPrefix::TxnActive.encode()) {
            let (key, _) = item?;
            let version = match Key::decode(&key)? {
                Key::TxnActive(version) => version,
                key => return errdata!("expected TxnActive key, got {key:?}"),
            };
            active.insert(version);
        }
        Ok(active)
    }

    /// Returns the version the transaction is running at. Read-write
    /// transactions write at this version, while read-only transactions only
    /// see versions below it.
    pub fn version(&self) -> Version {
        self.state.version
    }

    /// Returns whether the transaction is read-only. Read-only transactions
    /// can't write, and commit/rollback are no-ops for them.
    pub fn read_only(&self) -> bool {
        self.state.read_only
    }

    /// Returns the transaction's state. This can be used to instantiate a
    /// functionally equivalent transaction via resume().
    pub fn state(&self) -> &TransactionState {
        &self.state
    }

    /// Commits the transaction by taking it out of the active set, which
    /// immediately exposes its writes to transactions that begin afterwards.
    /// Its TxnWrite records are only needed for rollbacks, so they are removed
    /// as well. Committing a read-only transaction is a no-op.
    ///
    /// NB: commit does not flush writes to durable storage, since we rely on
    /// the Raft log for persistence.
    pub fn commit(self) -> Result<()> {
        if self.state.read_only {
            return Ok(());
        }
        let version = self.state.version;
        let mut engine = self.engine.lock()?;

        // Gather all write record keys before deleting any of them, since the
        // scan borrows the engine. A scan error aborts before any deletion.
        let mut records = Vec::new();
        for item in engine.scan_prefix(&KeyPrefix::TxnWrite(version).encode()) {
            let (key, _) = item?;
            records.push(key);
        }
        for key in records {
            engine.delete(&key)?;
        }

        // Leaving the active set is what makes the writes visible.
        engine.delete(&Key::TxnActive(version).encode())
    }

    /// Rolls back the transaction: every version it wrote is deleted along
    /// with its TxnWrite records, and it is taken out of the active set. The
    /// active set snapshot is deliberately kept, because time-travel queries
    /// at this version still need it. Rolling back a read-only transaction is
    /// a no-op.
    pub fn rollback(self) -> Result<()> {
        if self.state.read_only {
            return Ok(());
        }
        let version = self.state.version;
        let mut engine = self.engine.lock()?;

        // Plan all deletions before performing any, since the scan borrows
        // the engine. Each write record yields two keys, in this order: the
        // version that was written, then the write record itself.
        let mut doomed = Vec::new();
        for item in engine.scan_prefix(&KeyPrefix::TxnWrite(version).encode()) {
            let (key, _) = item?;
            match Key::decode(&key)? {
                Key::TxnWrite(_, key) => doomed.push(Key::Version(key, version).encode()),
                key => return errdata!("expected TxnWrite, got {key:?}"),
            }
            doomed.push(key);
        }
        for key in doomed {
            engine.delete(&key)?;
        }

        // Finally, leave the active set.
        engine.delete(&Key::TxnActive(version).encode())
    }

    /// Deletes a key, by writing a deletion tombstone at the transaction's
    /// version. Errors with Error::ReadOnly in a read-only transaction, and
    /// with Error::Serialization on a write conflict.
    pub fn delete(&self, key: &[u8]) -> Result<()> {
        self.write_version(key, None)
    }

    /// Sets a value for a key, at the transaction's version. Errors with
    /// Error::ReadOnly in a read-only transaction, and with
    /// Error::Serialization on a write conflict.
    pub fn set(&self, key: &[u8], value: Vec<u8>) -> Result<()> {
        self.write_version(key, Some(value))
    }

    /// Writes a new version for a key at the transaction's version. None writes
    /// a deletion tombstone. If a write conflict is found (either a newer or
    /// uncommitted version), a serialization error is returned. Replacing our
    /// own uncommitted write is fine.
    ///
    /// Errors with Error::ReadOnly if the transaction is read-only, and with
    /// Error::Serialization on a write conflict, in which case the caller
    /// must retry in a new transaction.
    fn write_version(&self, key: &[u8], value: Option<Vec<u8>>) -> Result<()> {
        if self.state.read_only {
            return Err(Error::ReadOnly);
        }
        let mut engine = self.engine.lock()?;

        // Check for write conflicts, i.e. if the latest key is invisible to us
        // (either a newer version, or an uncommitted version in our past). We
        // can only conflict with the latest key, since all transactions enforce
        // the same invariant.
        //
        // The range begins at the oldest version that could be invisible to
        // us: the lowest active version if any, otherwise the version just
        // above ours.
        let from = Key::Version(
            key.into(),
            self.state.active.first().copied().unwrap_or(self.state.version + 1),
        )
        .encode();
        let to = Key::Version(key.into(), u64::MAX).encode();
        // Only the latest version in the range matters, so fetch it from the
        // back of the scan rather than walking every version in the range.
        if let Some((key, _)) = engine.scan(from..=to).next_back().transpose()? {
            match Key::decode(&key)? {
                Key::Version(_, version) => {
                    if !self.state.is_visible(version) {
                        return Err(Error::Serialization);
                    }
                }
                key => return errdata!("expected Key::Version got {key:?}"),
            }
        }

        // Write the new version and its write record.
        //
        // NB: TxnWrite contains the provided user key, not the encoded engine
        // key, since we can construct the engine key using the version.
        engine.set(&Key::TxnWrite(self.state.version, key.into()).encode(), vec![])?;
        engine
            .set(&Key::Version(key.into(), self.state.version).encode(), bincode::serialize(&value))
    }

    /// Looks up the latest value of a key that is visible to the transaction.
    /// Returns None if the key has no visible version, or if its latest
    /// visible version is a deletion tombstone.
    pub fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
        let mut engine = self.engine.lock()?;
        let oldest = Key::Version(key.into(), 0).encode();
        let newest = Key::Version(key.into(), self.state.version).encode();

        // Walk the key's versions from newest to oldest, and return the first
        // one that's visible to us.
        for item in engine.scan(oldest..=newest).rev() {
            let (key, value) = item?;
            match Key::decode(&key)? {
                Key::Version(_, version) if self.state.is_visible(version) => {
                    return bincode::deserialize(&value);
                }
                // Invisible version, keep looking at older ones.
                Key::Version(..) => {}
                key => return errdata!("expected Key::Version got {key:?}"),
            }
        }
        Ok(None)
    }

    /// Returns an iterator over the latest visible key/value pairs at the
    /// transaction's version.
    pub fn scan(&self, range: impl RangeBounds<Vec<u8>>) -> ScanIterator<E> {
        let start = match range.start_bound() {
            Bound::Excluded(k) => Bound::Excluded(Key::Version(k.into(), u64::MAX).encode()),
            Bound::Included(k) => Bound::Included(Key::Version(k.into(), 0).encode()),
            Bound::Unbounded => Bound::Included(Key::Version(vec![].into(), 0).encode()),
        };
        let end = match range.end_bound() {
            Bound::Excluded(k) => Bound::Excluded(Key::Version(k.into(), 0).encode()),
            Bound::Included(k) => Bound::Included(Key::Version(k.into(), u64::MAX).encode()),
            Bound::Unbounded => Bound::Excluded(KeyPrefix::Unversioned.encode()),
        };
        ScanIterator::new(self.engine.clone(), self.state().clone(), (start, end))
    }

    /// Scans all keys that begin with the given prefix, yielding the latest
    /// visible key/value pair of each key.
    pub fn scan_prefix(&self, prefix: &[u8]) -> ScanIterator<E> {
        // KeyPrefix::Version encodes the complete byte slice, terminator
        // included, so as-is it would only match the versions of that exact
        // key. We want all keys matching the prefix, so drop the trailing
        // Keycode byte slice terminator 0x0000.
        let mut encoded = KeyPrefix::Version(prefix.into()).encode();
        let without_terminator = encoded.len() - 2;
        encoded.truncate(without_terminator);
        ScanIterator::new(
            self.engine.clone(),
            self.state.clone(),
            keycode::prefix_range(&encoded),
        )
    }
}

/// An iterator over the latest live and visible key/value pairs for the txn.
///
/// The (single-threaded) engine is shared via mutex, and holding the mutex for
/// the lifetime of the iterator can cause deadlocks (e.g. when the local SQL
/// engine pulls from two tables concurrently during a join). Instead, we pull
/// and buffer a batch of rows at a time, and release the mutex in between.
///
/// This does not implement DoubleEndedIterator (reverse scans), since the SQL
/// layer doesn't currently need it.
pub struct ScanIterator<E: Engine> {
    /// The engine.
    engine: Arc<Mutex<E>>,
    /// The transaction state. This is an owned copy, so the iterator does not
    /// borrow from the transaction that created it.
    txn: TransactionState,
    /// A buffer of live and visible key/value pairs to emit. Keys are decoded
    /// user keys, and values are decoded (tombstones are never buffered).
    buffer: VecDeque<(Vec<u8>, Vec<u8>)>,
    /// The remaining range after the buffer, as raw engine key bounds. None
    /// when there is nothing further to scan.
    remainder: Option<(Bound<Vec<u8>>, Bound<Vec<u8>>)>,
}

/// [`Clone`] is implemented by hand, because `derive(Clone)` would demand
/// `Engine: Clone` even though the engine sits behind an [`Arc`] and is never
/// cloned itself. See: <https://github.com/rust-lang/rust/issues/26925>.
impl<E: Engine> Clone for ScanIterator<E> {
    fn clone(&self) -> Self {
        let Self { engine, txn, buffer, remainder } = self;
        Self {
            // Shares the same engine, only the reference count is bumped.
            engine: Arc::clone(engine),
            txn: txn.clone(),
            buffer: buffer.clone(),
            remainder: remainder.clone(),
        }
    }
}

impl<E: Engine> ScanIterator<E> {
    /// The number of live key/value pairs to pull from the engine each time we
    /// lock it. Uses 2 in tests to exercise the buffering code.
    const BUFFER_SIZE: usize = if cfg!(test) { 2 } else { 32 };

    /// Creates a new scan iterator over the given range of raw engine keys.
    /// Nothing is read from the engine until the first call to next().
    fn new(
        engine: Arc<Mutex<E>>,
        txn: TransactionState,
        range: (Bound<Vec<u8>>, Bound<Vec<u8>>),
    ) -> Self {
        let buffer = VecDeque::with_capacity(Self::BUFFER_SIZE);
        Self { engine, txn, buffer, remainder: Some(range) }
    }

    /// Fills the buffer, if there's any pending items. Locks the engine for
    /// the duration of the call, and stops once BUFFER_SIZE live pairs have
    /// been buffered or the remaining range is exhausted.
    ///
    /// NB: the remaining range is taken up front and only restored when the
    /// buffer fills up with more items pending. If this returns an error, the
    /// range is gone and later calls will not resume the scan.
    fn fill_buffer(&mut self) -> Result<()> {
        // Check if there's anything to buffer.
        if self.buffer.len() >= Self::BUFFER_SIZE {
            return Ok(());
        }
        let Some(range) = self.remainder.take() else {
            return Ok(());
        };
        // The end bound stays the same for any later batch, only the start
        // bound advances.
        let range_end = range.1.clone();

        let mut engine = self.engine.lock()?;
        // VersionIterator yields only visible versions, in key/version order,
        // so the last entry for a key is its latest visible version.
        let mut iter = VersionIterator::new(&self.txn, engine.scan(range)).peekable();
        while let Some((key, _, value)) = iter.next().transpose()? {
            // If the next key equals this one, we're not at the latest version.
            match iter.peek() {
                Some(Ok((next, _, _))) if next == &key => continue,
                // peek() only yields a reference, so the error is cloned.
                Some(Err(err)) => return Err(err.clone()),
                Some(Ok(_)) | None => {}
            }

            // Decode the value, and skip deleted keys (tombstones).
            let Some(value) = bincode::deserialize(&value)? else { continue };
            self.buffer.push_back((key, value));

            // If we filled the buffer, save the remaining range (if any) and
            // return. peek() has already buffered next(), so pull it.
            if self.buffer.len() == Self::BUFFER_SIZE {
                if let Some((next, version, _)) = iter.next().transpose()? {
                    // We have to re-encode it as a raw engine key, since we
                    // only have access to the decoded MVCC user key.
                    let range_start = Bound::Included(Key::Version(next.into(), version).encode());
                    self.remainder = Some((range_start, range_end));
                }
                return Ok(());
            }
        }
        // The range is exhausted: remainder stays None, ending the scan once
        // the buffer has been drained.
        Ok(())
    }
}

impl<E: Engine> Iterator for ScanIterator<E> {
    type Item = Result<(Vec<u8>, Vec<u8>)>;

    /// Emits the next buffered key/value pair. The buffer is only refilled
    /// from the engine once it has been fully drained, and a refill error is
    /// emitted as an item.
    fn next(&mut self) -> Option<Self::Item> {
        if !self.buffer.is_empty() {
            return self.buffer.pop_front().map(Ok);
        }
        match self.fill_buffer() {
            Ok(()) => self.buffer.pop_front().map(Ok),
            Err(error) => Some(Err(error)),
        }
    }
}

/// An iterator that decodes raw engine key/value pairs into MVCC key/value
/// versions, and skips invisible versions. Helper for ScanIterator.
///
/// It yields every visible version of a key, not just the latest one, and does
/// not decode values: picking the latest version and dropping tombstones is
/// left to the caller.
struct VersionIterator<'a, I: engine::ScanIterator> {
    /// The transaction the scan is running in.
    txn: &'a TransactionState,
    /// The inner engine scan iterator. Must only yield Key::Version keys.
    inner: I,
}

impl<'a, I: engine::ScanIterator> VersionIterator<'a, I> {
    /// Wraps the given engine iterator in an MVCC version iterator for the
    /// given transaction.
    fn new(txn: &'a TransactionState, inner: I) -> Self {
        Self { txn, inner }
    }

    // Fallible next(). Returns the next key/version/value tuple that is
    // visible to the transaction, or None once the inner iterator is
    // exhausted. Errors if the inner iterator yields a non-Version key.
    fn try_next(&mut self) -> Result<Option<(Vec<u8>, Version, Vec<u8>)>> {
        loop {
            let Some((key, value)) = self.inner.next().transpose()? else {
                return Ok(None);
            };
            let Key::Version(key, version) = Key::decode(&key)? else {
                return errdata!("expected Key::Version got {key:?}");
            };
            // Invisible versions are skipped by moving on to the next entry.
            if self.txn.is_visible(version) {
                return Ok(Some((key.into_owned(), version, value)));
            }
        }
    }
}

impl<I: engine::ScanIterator> Iterator for VersionIterator<'_, I> {
    /// A visible (user key, version, raw value) tuple, or an error.
    type Item = Result<(Vec<u8>, Version, Vec<u8>)>;

    /// Adapts try_next() to the Iterator interface, by turning its
    /// Result<Option<_>> into the Option<Result<_>> that Iterator expects.
    fn next(&mut self) -> Option<Self::Item> {
        self.try_next().transpose()
    }
}

/// Most storage tests are Goldenscripts under src/storage/testscripts.
#[cfg(test)]
pub mod tests {
    use std::collections::HashMap;
    use std::error::Error;
    use std::fmt::Write as _;
    use std::path::Path;
    use std::result::Result;

    use crossbeam::channel::Receiver;
    use tempfile::TempDir;
    use test_case::test_case;
    use test_each_file::test_each_path;

    use super::*;
    use crate::encoding::format::{self, Formatter as _};
    use crate::storage::engine::test::{Emit, Mirror, Operation, decode_binary, parse_key_range};
    use crate::storage::{BitCask, Memory};

    // Generate one test per goldenscript file in src/storage/testscripts/mvcc,
    // each of which calls test_goldenscript with the script's path.
    test_each_path! { in "src/storage/testscripts/mvcc" as scripts => test_goldenscript }

    /// Runs a single goldenscript file against a fresh MVCCRunner, panicking
    /// if the script fails.
    fn test_goldenscript(path: &Path) {
        let mut runner = MVCCRunner::new();
        goldenscript::run(&mut runner, path).expect("goldenscript failed")
    }

    /// Checks that each encoded KeyPrefix is a byte prefix of the encoding of
    /// the corresponding Key variant.
    #[test_case(KeyPrefix::NextVersion, Key::NextVersion; "NextVersion")]
    #[test_case(KeyPrefix::TxnActive, Key::TxnActive(1); "TxnActive")]
    #[test_case(KeyPrefix::TxnActiveSnapshot, Key::TxnActiveSnapshot(1); "TxnActiveSnapshot")]
    #[test_case(KeyPrefix::TxnWrite(1), Key::TxnWrite(1, b"foo".as_slice().into()); "TxnWrite")]
    #[test_case(KeyPrefix::Version(b"foo".as_slice().into()), Key::Version(b"foo".as_slice().into(), 1); "Version")]
    #[test_case(KeyPrefix::Unversioned, Key::Unversioned(b"foo".as_slice().into()); "Unversioned")]
    fn key_prefix(prefix: KeyPrefix, key: Key) {
        let (encoded_prefix, encoded_key) = (prefix.encode(), key.encode());
        // Compare against the leading bytes of the key. Slicing panics if the
        // key is shorter than the prefix, which also fails the test.
        assert_eq!(encoded_prefix, encoded_key[..encoded_prefix.len()])
    }

    /// Runs MVCC goldenscript tests. Holds the MVCC store under test along
    /// with the named transactions that scripts have opened on it.
    pub struct MVCCRunner {
        /// The MVCC store under test.
        mvcc: MVCC<TestEngine>,
        /// Open transactions, keyed by the name given as the command prefix.
        txns: HashMap<String, Transaction<TestEngine>>,
        /// Receives the engine operations emitted by the Emit engine wrapper.
        op_rx: Receiver<Operation>,
        /// Temporary directory containing the BitCask file. Never read after
        /// construction; presumably held so the directory isn't removed while
        /// the engine is still in use.
        _tempdir: TempDir,
    }

    /// The engine stack used by the runner: a BitCask and a Memory engine
    /// mirrored against each other, wrapped in an operation emitter.
    type TestEngine = Emit<Mirror<BitCask, Memory>>;

    impl MVCCRunner {
        /// Creates a runner with an empty MVCC store and no open transactions.
        fn new() -> Self {
            // Use both a BitCask and a Memory engine, and mirror operations
            // across them. Emit engine operations to op_rx.
            let (op_tx, op_rx) = crossbeam::channel::unbounded();
            let tempdir = TempDir::with_prefix("toydb").expect("tempdir failed");
            let bitcask = BitCask::new(tempdir.path().join("bitcask")).expect("bitcask failed");
            let memory = Memory::new();
            let engine = Emit::new(Mirror::new(bitcask, memory), op_tx);
            let mvcc = MVCC::new(engine);
            Self { mvcc, op_rx, txns: HashMap::new(), _tempdir: tempdir }
        }

        /// Fetches the named transaction from a command prefix. Errors if no
        /// prefix was given or if no transaction with that name is open.
        fn get_txn(
            &mut self,
            prefix: &Option<String>,
        ) -> Result<&'_ mut Transaction<TestEngine>, Box<dyn Error>> {
            let name = Self::txn_name(prefix)?;
            // Build the error lazily, so the message is only formatted and
            // boxed when the lookup actually fails.
            self.txns.get_mut(name).ok_or_else(|| format!("unknown txn {name}").into())
        }

        /// Fetches the txn name from a command prefix, or errors if the
        /// command has no prefix.
        fn txn_name(prefix: &Option<String>) -> Result<&str, Box<dyn Error>> {
            prefix.as_deref().ok_or_else(|| "no txn name".into())
        }

        /// Errors if a txn prefix is given. Used by commands that operate on
        /// the store itself rather than on a named transaction.
        fn no_txn(command: &goldenscript::Command) -> Result<(), Box<dyn Error>> {
            match &command.prefix {
                Some(name) => Err(format!("can't run {} with txn {name}", command.name).into()),
                None => Ok(()),
            }
        }
    }

    impl goldenscript::Runner for MVCCRunner {
        /// Executes a single goldenscript command and returns its output.
        ///
        /// Commands documented below with a `txn:` prefix operate on the named
        /// transaction given as the command prefix (e.g. `t1: begin`). Several
        /// of the others reject a prefix via no_txn(). The `ops` tag appends
        /// the engine operations received so far to the output; any other tag
        /// is an error.
        fn run(&mut self, command: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
            let mut output = String::new();
            // Work on a copy of the tags, so that handled tags can be removed
            // and any leftovers rejected at the end.
            let mut tags = command.tags.clone();

            match command.name.as_str() {
                // txn: begin [readonly] [as_of=VERSION]
                "begin" => {
                    let name = Self::txn_name(&command.prefix)?;
                    if self.txns.contains_key(name) {
                        return Err(format!("txn {name} already exists").into());
                    }
                    let mut args = command.consume_args();
                    let readonly = match args.next_pos().map(|a| a.value.as_str()) {
                        Some("readonly") => true,
                        None => false,
                        Some(v) => return Err(format!("invalid argument {v}").into()),
                    };
                    let as_of = args.lookup_parse("as_of")?;
                    args.reject_rest()?;
                    // as_of is only accepted together with readonly.
                    let txn = match (readonly, as_of) {
                        (false, None) => self.mvcc.begin()?,
                        (true, None) => self.mvcc.begin_read_only()?,
                        (true, Some(v)) => self.mvcc.begin_as_of(v)?,
                        (false, Some(_)) => return Err("as_of only valid for read-only txn".into()),
                    };
                    self.txns.insert(name.to_string(), txn);
                }

                // txn: commit
                "commit" => {
                    let name = Self::txn_name(&command.prefix)?;
                    // The txn is removed from the runner before committing, so
                    // the name is released even if the commit fails.
                    let txn = self.txns.remove(name).ok_or(format!("unknown txn {name}"))?;
                    command.consume_args().reject_rest()?;
                    txn.commit()?;
                }

                // txn: delete KEY...
                "delete" => {
                    let txn = self.get_txn(&command.prefix)?;
                    let mut args = command.consume_args();
                    for arg in args.rest_pos() {
                        let key = decode_binary(&arg.value);
                        txn.delete(&key)?;
                    }
                    args.reject_rest()?;
                }

                // dump
                "dump" => {
                    command.consume_args().reject_rest()?;
                    // Scan the underlying engine directly rather than going
                    // through a transaction, and output each pair both in MVCC
                    // format and in raw format.
                    let mut engine = self.mvcc.engine.lock().unwrap();
                    let mut scan = engine.scan(..);
                    while let Some((key, value)) = scan.next().transpose()? {
                        let fmtkv = format::MVCC::<format::Raw>::key_value(&key, &value);
                        let rawkv = format::Raw::key_value(&key, &value);
                        writeln!(output, "{fmtkv} [{rawkv}]")?;
                    }
                }

                // txn: get KEY...
                "get" => {
                    let txn = self.get_txn(&command.prefix)?;
                    let mut args = command.consume_args();
                    for arg in args.rest_pos() {
                        let key = decode_binary(&arg.value);
                        let value = txn.get(&key)?;
                        let fmtkv = format::Raw::key_maybe_value(&key, value.as_deref());
                        writeln!(output, "{fmtkv}")?;
                    }
                    args.reject_rest()?;
                }

                // get_unversioned KEY...
                "get_unversioned" => {
                    Self::no_txn(command)?;
                    let mut args = command.consume_args();
                    for arg in args.rest_pos() {
                        let key = decode_binary(&arg.value);
                        let value = self.mvcc.get_unversioned(&key)?;
                        let fmtkv = format::Raw::key_maybe_value(&key, value.as_deref());
                        writeln!(output, "{fmtkv}")?;
                    }
                    args.reject_rest()?;
                }

                // import [VERSION] KEY=VALUE...
                "import" => {
                    Self::no_txn(command)?;
                    let mut args = command.consume_args();
                    let version = args.next_pos().map(|a| a.parse()).transpose()?;
                    let mut txn = self.mvcc.begin()?;
                    if let Some(version) = version {
                        if txn.version() > version {
                            return Err(format!("version {version} already used").into());
                        }
                        // Keep beginning new transactions until one lands on
                        // the requested version. The intermediate transactions
                        // are dropped without an explicit commit or rollback.
                        while txn.version() < version {
                            txn = self.mvcc.begin()?;
                        }
                    }
                    for kv in args.rest_key() {
                        let key = decode_binary(kv.key.as_ref().unwrap());
                        let value = decode_binary(&kv.value);
                        // An empty value is imported as a delete of the key.
                        if value.is_empty() {
                            txn.delete(&key)?;
                        } else {
                            txn.set(&key, value)?;
                        }
                    }
                    args.reject_rest()?;
                    txn.commit()?;
                }

                // txn: resume JSON
                "resume" => {
                    let name = Self::txn_name(&command.prefix)?;
                    let mut args = command.consume_args();
                    let raw = &args.next_pos().ok_or("state not given")?.value;
                    args.reject_rest()?;
                    // The argument is a JSON-encoded TransactionState. The
                    // resumed txn replaces any existing txn of the same name.
                    let state: TransactionState = serde_json::from_str(raw)?;
                    let txn = self.mvcc.resume(state)?;
                    self.txns.insert(name.to_string(), txn);
                }

                // txn: rollback
                "rollback" => {
                    let name = Self::txn_name(&command.prefix)?;
                    // As with commit, the txn is removed before rolling back.
                    let txn = self.txns.remove(name).ok_or(format!("unknown txn {name}"))?;
                    command.consume_args().reject_rest()?;
                    txn.rollback()?;
                }

                // txn: scan [RANGE]
                "scan" => {
                    let txn = self.get_txn(&command.prefix)?;
                    let mut args = command.consume_args();
                    // Default to a full scan when no range is given.
                    let range =
                        parse_key_range(args.next_pos().map(|a| a.value.as_str()).unwrap_or(".."))?;
                    args.reject_rest()?;

                    // Collect all results first, so that a scan error produces
                    // no partial output.
                    let kvs: Vec<_> = txn.scan(range).try_collect()?;
                    for (key, value) in kvs {
                        writeln!(output, "{}", format::Raw::key_value(&key, &value))?;
                    }
                }

                // txn: scan_prefix PREFIX
                "scan_prefix" => {
                    let txn = self.get_txn(&command.prefix)?;
                    let mut args = command.consume_args();
                    let prefix = decode_binary(&args.next_pos().ok_or("prefix not given")?.value);
                    args.reject_rest()?;

                    let kvs: Vec<_> = txn.scan_prefix(&prefix).try_collect()?;
                    for (key, value) in kvs {
                        writeln!(output, "{}", format::Raw::key_value(&key, &value))?;
                    }
                }

                // txn: set KEY=VALUE...
                "set" => {
                    let txn = self.get_txn(&command.prefix)?;
                    let mut args = command.consume_args();
                    for kv in args.rest_key() {
                        let key = decode_binary(kv.key.as_ref().unwrap());
                        let value = decode_binary(&kv.value);
                        txn.set(&key, value)?;
                    }
                    args.reject_rest()?;
                }

                // set_unversioned KEY=VALUE...
                "set_unversioned" => {
                    Self::no_txn(command)?;
                    let mut args = command.consume_args();
                    for kv in args.rest_key() {
                        let key = decode_binary(kv.key.as_ref().unwrap());
                        let value = decode_binary(&kv.value);
                        self.mvcc.set_unversioned(&key, value)?;
                    }
                    args.reject_rest()?;
                }

                // txn: state
                "state" => {
                    command.consume_args().reject_rest()?;
                    let txn = self.get_txn(&command.prefix)?;
                    let state = txn.state();
                    // Output e.g. "v1 rw active={}". The active set is sorted
                    // before joining.
                    write!(
                        output,
                        "v{} {} active={{{}}}",
                        state.version,
                        if state.read_only { "ro" } else { "rw" },
                        state.active.iter().sorted().join(",")
                    )?;
                }

                // status
                "status" => writeln!(output, "{:#?}", self.mvcc.status()?)?,

                name => return Err(format!("invalid command {name}").into()),
            }

            // If requested, output engine operations.
            if tags.remove("ops") {
                // Drain everything received so far without blocking. Each
                // operation is shown in MVCC format, followed by the raw
                // format in brackets.
                while let Ok(op) = self.op_rx.try_recv() {
                    match op {
                        Operation::Delete { key } => {
                            let fmtkey = format::MVCC::<format::Raw>::key(&key);
                            let rawkey = format::Raw::key(&key);
                            writeln!(output, "engine delete {fmtkey} [{rawkey}]")?
                        }
                        Operation::Flush => writeln!(output, "engine flush")?,
                        Operation::Set { key, value } => {
                            let fmtkv = format::MVCC::<format::Raw>::key_value(&key, &value);
                            let rawkv = format::Raw::key_value(&key, &value);
                            writeln!(output, "engine set {fmtkv} [{rawkv}]")?
                        }
                    }
                }
            }

            // Any tag still present was not handled above.
            if let Some(tag) = tags.iter().next() {
                return Err(format!("unknown tag {tag}").into());
            }

            Ok(output)
        }

        // Drain unhandled engine operations. Presumably this keeps operations
        // from a command without the ops tag from showing up in the output of
        // a later command that has it.
        fn end_command(&mut self, _: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
            while self.op_rx.try_recv().is_ok() {}
            Ok(String::new())
        }
    }
}


================================================
FILE: src/storage/testscripts/bitcask/compact
================================================
# Tests compaction.

# Write some initial data out of order, with some overwrites and deletes.
set foo=bar
set b=1
set b=2
set e=5
delete e
set c=0
delete c
set c=3
set ""=""
set a=1
delete f
delete d
set d=4
scan
---
"" → ""
"a" → "1"
"b" → "2"
"c" → "3"
"d" → "4"
"foo" → "bar"

# Show status.
status
---
Status {
    name: "bitcask",
    keys: 6,
    size: 14,
    disk_size: 128,
    live_disk_size: 62,
}

# Dump the log.
dump
---
0@0     keylen=3 [00000003] valuelen=3 [00000003]
14b     key="foo" [666f6f] value="bar" [626172]
--------
1@14    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="b" [62] value="1" [31]
--------
2@24    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="b" [62] value="2" [32]
--------
3@34    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="e" [65] value="5" [35]
--------
4@44    keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="e" [65] tombstone
--------
5@53    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="c" [63] value="0" [30]
--------
6@63    keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="c" [63] tombstone
--------
7@72    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="c" [63] value="3" [33]
--------
8@82    keylen=0 [00000000] valuelen=0 [00000000]
8b      key="" [] value="" []
--------
9@90    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="a" [61] value="1" [31]
--------
10@100  keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="f" [66] tombstone
--------
11@109  keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="d" [64] tombstone
--------
12@118  keylen=1 [00000001] valuelen=1 [00000001]
10b     key="d" [64] value="4" [34]

# Compact it.
compact
---
ok

# Scan should still give same results.
scan
---
"" → ""
"a" → "1"
"b" → "2"
"c" → "3"
"d" → "4"
"foo" → "bar"

# Status should show no garbage.
status
---
Status {
    name: "bitcask",
    keys: 6,
    size: 14,
    disk_size: 62,
    live_disk_size: 62,
}

# Dump the compacted log.
dump
---
0@0     keylen=0 [00000000] valuelen=0 [00000000]
8b      key="" [] value="" []
--------
1@8     keylen=1 [00000001] valuelen=1 [00000001]
10b     key="a" [61] value="1" [31]
--------
2@18    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="b" [62] value="2" [32]
--------
3@28    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="c" [63] value="3" [33]
--------
4@38    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="d" [64] value="4" [34]
--------
5@48    keylen=3 [00000003] valuelen=3 [00000003]
14b     key="foo" [666f6f] value="bar" [626172]

# Reopening the file works and shows the same data.
reopen
scan
---
"" → ""
"a" → "1"
"b" → "2"
"c" → "3"
"d" → "4"
"foo" → "bar"


================================================
FILE: src/storage/testscripts/bitcask/compact_open
================================================
# Tests that the log is auto-compacted on startup if the fraction of garbage
# exceeds the given threshold.

# Write some initial data out of order, with some overwrites and deletes.
set foo=bar
set b=1
set b=2
set e=5
delete e
set c=0
delete c
set c=3
set ""=""
set a=1
delete f
delete d
set d=4
scan
---
"" → ""
"a" → "1"
"b" → "2"
"c" → "3"
"d" → "4"
"foo" → "bar"

# Status shows the garbage fraction is about 0.52, i.e. (128 - 62) / 128.
status
---
Status {
    name: "bitcask",
    keys: 6,
    size: 14,
    disk_size: 128,
    live_disk_size: 62,
}

# Reopening with a garbage fraction of 0.6 does not compact.
reopen compact_fraction=0.6
status
---
Status {
    name: "bitcask",
    keys: 6,
    size: 14,
    disk_size: 128,
    live_disk_size: 62,
}

# Reopening with a fraction of 0.5 does compact.
reopen compact_fraction=0.5
status
---
Status {
    name: "bitcask",
    keys: 6,
    size: 14,
    disk_size: 62,
    live_disk_size: 62,
}

dump
---
0@0     keylen=0 [00000000] valuelen=0 [00000000]
8b      key="" [] value="" []
--------
1@8     keylen=1 [00000001] valuelen=1 [00000001]
10b     key="a" [61] value="1" [31]
--------
2@18    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="b" [62] value="2" [32]
--------
3@28    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="c" [63] value="3" [33]
--------
4@38    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="d" [64] value="4" [34]
--------
5@48    keylen=3 [00000003] valuelen=3 [00000003]
14b     key="foo" [666f6f] value="bar" [626172]


================================================
FILE: src/storage/testscripts/bitcask/log
================================================
# Assert the raw structure of the BitCask log.

# Write some initial data out of order, with some overwrites and deletes.
set foo=bar
set b=1
set b=2
set e=5
delete e
set c=0
delete c
set c=3
set ""=""
set a=1
delete f
delete d
set d=4
scan
---
"" → ""
"a" → "1"
"b" → "2"
"c" → "3"
"d" → "4"
"foo" → "bar"

# Dump the log.
dump
---
0@0     keylen=3 [00000003] valuelen=3 [00000003]
14b     key="foo" [666f6f] value="bar" [626172]
--------
1@14    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="b" [62] value="1" [31]
--------
2@24    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="b" [62] value="2" [32]
--------
3@34    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="e" [65] value="5" [35]
--------
4@44    keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="e" [65] tombstone
--------
5@53    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="c" [63] value="0" [30]
--------
6@63    keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="c" [63] tombstone
--------
7@72    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="c" [63] value="3" [33]
--------
8@82    keylen=0 [00000000] valuelen=0 [00000000]
8b      key="" [] value="" []
--------
9@90    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="a" [61] value="1" [31]
--------
10@100  keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="f" [66] tombstone
--------
11@109  keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="d" [64] tombstone
--------
12@118  keylen=1 [00000001] valuelen=1 [00000001]
10b     key="d" [64] value="4" [34]

# Reopen the log, which shows the same data.
reopen
scan
---
"" → ""
"a" → "1"
"b" → "2"
"c" → "3"
"d" → "4"
"foo" → "bar"

dump
---
0@0     keylen=3 [00000003] valuelen=3 [00000003]
14b     key="foo" [666f6f] value="bar" [626172]
--------
1@14    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="b" [62] value="1" [31]
--------
2@24    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="b" [62] value="2" [32]
--------
3@34    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="e" [65] value="5" [35]
--------
4@44    keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="e" [65] tombstone
--------
5@53    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="c" [63] value="0" [30]
--------
6@63    keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="c" [63] tombstone
--------
7@72    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="c" [63] value="3" [33]
--------
8@82    keylen=0 [00000000] valuelen=0 [00000000]
8b      key="" [] value="" []
--------
9@90    keylen=1 [00000001] valuelen=1 [00000001]
10b     key="a" [61] value="1" [31]
--------
10@100  keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="f" [66] tombstone
--------
11@109  keylen=1 [00000001] valuelen=-1 [ffffffff]
9b      key="d" [64] tombstone
--------
12@118  keylen=1 [00000001] valuelen=1 [00000001]
10b     key="d" [64] value="4" [34]


================================================
FILE: src/storage/testscripts/bitcask/status
================================================
# Tests status for BitCask engine.

set foo=123
set bar=1
delete bar
set baz=1
set baz=2
set baz=3
delete qux
---
ok

scan
---
"baz" → "3"
"foo" → "123"

status
---
Status {
    name: "bitcask",
    keys: 2,
    size: 10,
    disk_size: 84,
    live_disk_size: 26,
}

# Compact the log and show status again.
compact
status
---
Status {
    name: "bitcask",
    keys: 2,
    size: 10,
    disk_size: 26,
    live_disk_size: 26,
}


================================================
FILE: src/storage/testscripts/engine/keys
================================================
# Tests various keys.

# Keys are case-sensitive.
set a=1
get a
get A
---
"a" → "1"
"A" → None

set A=2
get a
get A
---
"a" → "1"
"A" → "2"

delete a
delete A
scan
---
ok

# Empty keys and values are valid.
set ""=""
get ""
scan
delete ""
---
"" → ""
"" → ""

scan
---
ok

# NUL keys and values are valid.
set "\0"="\0"
get "\0"
scan
delete "\0"
---
"\x00" → "\x00"
"\x00" → "\x00"

scan
---
ok

# Unicode keys and values work, but are shown as raw UTF-8 bytes.
set "👋"="👋"
get "👋"
scan
delete "👋"
---
"\xf0\x9f\x91\x8b" → "\xf0\x9f\x91\x8b"
"\xf0\x9f\x91\x8b" → "\xf0\x9f\x91\x8b"

scan
---
ok


================================================
FILE: src/storage/testscripts/engine/point
================================================
# Tests basic point operations.

# Getting a missing key in an empty store should return None.
get a
---
"a" → None

# Write a couple of keys.
set a=1
set b=2
---
ok

# Reading the value back should return it. An unknown key should return None.
get a
get b
get c
---
"a" → "1"
"b" → "2"
"c" → None

# Replacing a key should return the new value.
set a=foo
get a
---
"a" → "foo"

# Deleting a key should remove it, but not affect other keys.
delete a
get a
get b
---
"a" → None
"b" → "2"

# Deletes are idempotent.
delete a
get a
---
"a" → None

# Writing a deleted key works fine.
set a=1
get a
---
"a" → "1"

# Scan the final state.
scan
---
"a" → "1"
"b" → "2"


================================================
FILE: src/storage/testscripts/engine/scan
================================================
# Tests range scans.

# Write some initial data.
set a=1
set b=2
set ba=21
set bb=22
set c=3
set C=3
---
ok

# Forward and reverse scans.
scan
---
"C" → "3"
"a" → "1"
"b" → "2"
"ba" → "21"
"bb" → "22"
"c" → "3"

scan reverse=true
---
"c" → "3"
"bb" → "22"
"ba" → "21"
"b" → "2"
"a" → "1"
"C" → "3"

# Inclusive and exclusive ranges.
scan b..bb
---
"b" → "2"
"ba" → "21"

scan "b..=bb"
---
"b" → "2"
"ba" → "21"
"bb" → "22"

scan "b..=bb" reverse=true
---
"bb" → "22"
"ba" → "21"
"b" → "2"

# Open ranges.
scan bb..
---
"bb" → "22"
"c" → "3"

scan "..=b"
---
"C" → "3"
"a" → "1"
"b" → "2"


================================================
FILE: src/storage/testscripts/engine/scan_prefix
================================================
# Tests prefix scans.

# Set up an initial dataset of keys with overlapping or adjacent prefixes.
set a=1
set b=2
set ba=21
set bb=22
set "b\xff"=2f
set "b\xff\x00"=2f0
set "b\xffb"=2fb
set "b\xff\xff"=2ff
set c=3
set "\xff"=f
set "\xff\xff"=ff
set "\xff\xff\xff"=fff
set "\xff\xff\xff\xff"=ffff
scan
---
"a" → "1"
"b" → "2"
"ba" → "21"
"bb" → "22"
"b\xff" → "2f"
"b\xff\x00" → "2f0"
"b\xffb" → "2fb"
"b\xff\xff" → "2ff"
"c" → "3"
"\xff" → "f"
"\xff\xff" → "ff"
"\xff\xff\xff" → "fff"
"\xff\xff\xff\xff" → "ffff"

# An empty prefix returns everything.
scan_prefix ""
---
"a" → "1"
"b" → "2"
"ba" → "21"
"bb" → "22"
"b\xff" → "2f"
"b\xff\x00" → "2f0"
"b\xffb" → "2fb"
"b\xff\xff" → "2ff"
"c" → "3"
"\xff" → "f"
"\xff\xff" → "ff"
"\xff\xff\xff" → "fff"
"\xff\xff\xff\xff" → "ffff"

# A missing prefix returns nothing.
scan_prefix bx
---
ok

# Various prefixes under b. In particular, this tests that the bounds generation
# handles 0xff bytes properly.
scan_prefix b
---
"b" → "2"
"ba" → "21"
"bb" → "22"
"b\xff" → "2f"
"b\xff\x00" → "2f0"
"b\xffb" → "2fb"
"b\xff\xff" → "2ff"

scan_prefix bb
---
"bb" → "22"

scan_prefix "b\xff"
---
"b\xff" → "2f"
"b\xff\x00" → "2f0"
"b\xffb" → "2fb"
"b\xff\xff" → "2ff"

scan_prefix "b\xff\x00"
---
"b\xff\x00" → "2f0"

scan_prefix "b\xff\xff"
---
"b\xff\xff" → "2ff"

# Chains of \xff prefixes.
scan_prefix "\xff"
---
"\xff" → "f"
"\xff\xff" → "ff"
"\xff\xff\xff" → "fff"
"\xff\xff\xff\xff" → "ffff"

scan_prefix "\xff\xff"
---
"\xff\xff" → "ff"
"\xff\xff\xff" → "fff"
"\xff\xff\xff\xff" → "ffff"

scan_prefix "\xff\xff\xff"
---
"\xff\xff\xff" → "fff"
"\xff\xff\xff\xff" → "ffff"

scan_prefix "\xff\xff\xff\xff"
---
"\xff\xff\xff\xff" → "ffff"

scan_prefix "\xff\xff\xff\xff\xff"
---
ok


================================================
FILE: src/storage/testscripts/memory/status
================================================
# Tests status for Memory engine.

set foo=123
set bar=1
delete bar
set baz=1
set baz=2
set baz=3
delete qux
---
ok

status
---
Status {
    name: "memory",
    keys: 2,
    size: 10,
    disk_size: 0,
    live_disk_size: 0,
}


================================================
FILE: src/storage/testscripts/mvcc/anomaly_dirty_read
================================================
# A dirty read is when t2 can read an uncommitted value set by t1. Snapshot
# isolation prevents this.

t1: begin
t1: set key=1
---
ok

t2: begin
t2: get key
---
t2: "key" → None


================================================
FILE: src/storage/testscripts/mvcc/anomaly_dirty_write
================================================
# A dirty write is when t2 overwrites an uncommitted value written by t1.
# Snapshot isolation prevents this.

t1: begin
t1: set key=1
---
ok

t2: begin
t2: !set key=2
---
t2: Error: serialization failure, retry transaction


================================================
FILE: src/storage/testscripts/mvcc/anomaly_fuzzy_read
================================================
# A fuzzy (or unrepeatable) read is when t2 sees a value change after t1
# updates it. Snapshot isolation prevents this.

# Set up some initial data.
import key=0
---
ok

t1: begin
t2: begin
---
ok

t2: get key
---
t2: "key" → "0"

t1: set key=1
t1: commit
---
ok

t2: get key
---
t2: "key" → "0"


================================================
FILE: src/storage/testscripts/mvcc/anomaly_lost_update
================================================
# A lost update is when t1 and t2 both read a value and update it, where
# t2's update replaces t1. Snapshot isolation prevents this.

t1: begin
t1: get key
---
t1: "key" → None

t2: begin
t2: get key
---
t2: "key" → None

t1: set key=1
t2: !set key=2
---
t2: Error: serialization failure, retry transaction


================================================
FILE: src/storage/testscripts/mvcc/anomaly_phantom_read
================================================
# A phantom read is when t1 reads entries matching some predicate, but a
# modification by t2 changes which entries match the predicate such that a later
# read by t1 returns them. Snapshot isolation prevents this.
#
# We use a prefix scan as our predicate.

# Write some initial data.
import a=0 ba=0 bb=0
---
ok

t1: begin
t2: begin
---
ok

t1: scan_prefix b
---
t1: "ba" → "0"
t1: "bb" → "0"

t2: delete ba
t2: set bc=2
t2: commit
---
ok

t1: scan_prefix b
---
t1: "ba" → "0"
t1: "bb" → "0"


================================================
FILE: src/storage/testscripts/mvcc/anomaly_read_skew
================================================
# Read skew is when t1 reads a and b, but t2 modifies b in between the
# reads. Snapshot isolation prevents this.

# Set up some initial data.
import a=0 b=0
---
ok

t1: begin
t2: begin
---
ok

t1: get a
---
t1: "a" → "0"

t2: set a=2
t2: set b=2
t2: commit
---
ok

t1: get b
---
t1: "b" → "0"


================================================
FILE: src/storage/testscripts/mvcc/anomaly_write_skew
================================================
# Write skew is when t1 reads a and writes it to b while t2 reads b and writes
# it to a. Snapshot isolation does not prevent this, which is expected, so we
# assert the anomalous behavior. Fixing this would require implementing
# serializable snapshot isolation.

# Write some initial data.
import a=1 b=2
---
ok

t1: begin
t2: begin
---
ok

t1: get a
t2: get b
---
t1: "a" → "1"
t2: "b" → "2"

t1: set b=1
t2: set a=2
---
ok

t1: commit
t2: commit
---
ok

t3: begin readonly
t3: scan
---
t3: "a" → "2"
t3: "b" → "1"


================================================
FILE: src/storage/testscripts/mvcc/bank
================================================
# A simple illustration of MVCC transactions with bank transfers.
#
# We start with three bank accounts A, B, and C, each with a balance of 100.
import A=100 B=100 C=100
---
ok

# Alice wants to transfer 100 from B to A. She begins a transaction and
# checks the balance of all accounts.
alice: begin
alice: scan
---
alice: "A" → "100"
alice: "B" → "100"
alice: "C" → "100"

# She then subtracts 100 from B, and is about to add 100 to A.
alice: set B=0
---
ok

# Bob comes along and wants to transfer 100 from B to C. He begins a transaction
# and checks the balances.
#
# Bob might freak out if there was no money in B and only 200 total in all
# accounts, but Alice hasn't yet committed her change to B so it's not visible.
# If the system were to crash or Alice disconnects, B would still have 100.
bob: begin
bob: scan
---
bob: "A" → "100"
bob: "B" → "100"
bob: "C" → "100"

# Alice now completes the transfer by adding 100 to A and committing to finalize
# the transaction.
alice: set A=200
alice: scan
---
alice: "A" → "200"
alice: "B" → "0"
alice: "C" → "100"

alice: commit
---
ok

# But what about Bob? If he now sets C=200 and B=0, we'll have A=200 B=0 C=200,
# and 100 would have appeared out of thin air! Thankfully, MVCC saves us:
bob: set C=200
---
ok

bob: !set B=0
---
bob: Error: serialization failure, retry transaction

# MVCC caught the conflict, and Bob has to roll back and retry.
bob: rollback
---
ok

# He then finds there's no money left in B anymore, and can't make the transfer.
bob: begin
bob: scan
---
bob: "A" → "200"
bob: "B" → "0"
bob: "C" → "100"


================================================
FILE: src/storage/testscripts/mvcc/begin
================================================
# Begin creates new transactions at increasing versions, with concurrent
# transactions in their active sets.

# Start t1 at v1, with an empty active set. Dump raw engine operations to ensure
# it bumps the next version and registers itself as active.
t1: begin [ops]
t1: state
---
t1: engine set mvcc:NextVersion → 2 ["\x00" → "\x02"]
t1: engine set mvcc:TxnActive(1) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x01" → ""]
t1: v1 rw active={}

# t2 should have v2, and t1 in its active set. It should persist a snapshot of
# its active set.
t2: begin [ops]
t2: state
---
t2: engine set mvcc:NextVersion → 3 ["\x00" → "\x03"]
t2: engine set mvcc:TxnActiveSnapshot(2) → {1} ["\x02\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x01"]
t2: engine set mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
t2: v2 rw active={1}

# Similarly for t3.
t3: begin [ops]
t3: state
---
t3: engine set mvcc:NextVersion → 4 ["\x00" → "\x04"]
t3: engine set mvcc:TxnActiveSnapshot(3) → {1,2} ["\x02\x00\x00\x00\x00\x00\x00\x00\x03" → "\x02\x01\x02"]
t3: engine set mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
t3: v3 rw active={1,2}

# Now, commit t2, which unregisters it.
t2: commit [ops]
---
t2: engine delete mvcc:TxnActive(2) ["\x01\x00\x00\x00\x00\x00\x00\x00\x02"]

# It should still be in t3's active set.
t3: state
---
t3: v3 rw active={1,2}

# But not in a new t4.
t4: begin [ops]
t4: state
---
t4: engine set mvcc:NextVersion → 5 ["\x00" → "\x05"]
t4: engine set mvcc:TxnActiveSnapshot(4) → {1,3} ["\x02\x00\x00\x00\x00\x00\x00\x00\x04" → "\x02\x01\x03"]
t4: engine set mvcc:TxnActive(4) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x04" → ""]
t4: v4 rw active={1,3}


================================================
FILE: src/storage/testscripts/mvcc/begin_as_of
================================================
# Begin read-only as-of should provide a view of a historical version.

# Start a concurrent transaction at v1 that should be invisible.
t1: begin
t1: set other=1
---
ok

# Write and commit a key at v2.
t2: begin
t2: set key=2
t2: commit
---
ok

# Write another version at v3, but don't commit it yet.
t3: begin
t3: set key=3
---
ok

dump
---
mvcc:NextVersion → 4 ["\x00" → "\x04"]
mvcc:TxnActive(1) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x01" → ""]
mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
mvcc:TxnActiveSnapshot(2) → {1} ["\x02\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x01"]
mvcc:TxnActiveSnapshot(3) → {1} ["\x02\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x01"]
mvcc:TxnWrite(1, "other") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01other\x00\x00" → ""]
mvcc:TxnWrite(3, "key") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03key\x00\x00" → ""]
mvcc:Version("key", 2) → "2" ["\x04key\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]
mvcc:Version("key", 3) → "3" ["\x04key\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x013"]
mvcc:Version("other", 1) → "1" ["\x04other\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]

# Start a read-only transaction as-of version 3. It should only see key=2
# because t1 and t3 haven't committed yet. It shouldn't write any state.
t4: begin readonly as_of=3 [ops]
t4: state
---
t4: v3 ro active={1}

t4: scan
---
t4: "key" → "2"

# Writes should error.
t4: !set foo=bar
t4: !delete foo
---
t4: Error: read-only transaction
t4: Error: read-only transaction

# t1 and t3 commit. Their writes still shouldn't be visible to t4, since
# versions must be stable.
t1: commit
t3: commit
---
ok

t4: scan
---
t4: "key" → "2"

# A new transaction t5 running as-of v3 shouldn't see them either.
t5: begin readonly as_of=3
t5: state
---
t5: v3 ro active={1}

t5: scan
---
t5: "key" → "2"

# Committing and rolling back readonly txns is a noop.
t4: commit [ops]
t5: rollback [ops]
---
ok

# Commit a new value at version 4.
t6: begin
t6: state
t6: set key=4
t6: commit
---
t6: v4 rw active={}

# A snapshot at version 4 should see the old writes, but not those of t6 at v4
# because as_of is at the start of the version.
t7: begin readonly as_of=4
t7: scan
---
t7: "key" → "3"
t7: "other" → "1"

# Running as_of future versions should error, including the next version.
t8: !begin readonly as_of=5
t8: !begin readonly as_of=9
---
t8: Error: invalid input: version 5 does not exist
t8: Error: invalid input: version 9 does not exist

# Version 0 works, though it doesn't show anything.
t8: begin readonly as_of=0
t8: state
t8: scan
---
t8: v0 ro active={}


================================================
FILE: src/storage/testscripts/mvcc/begin_readonly
================================================
# Begin read-only should not create a new version. Instead, it should run in the
# next version, using the current active set.

# Start t1 read-only at v1. It shouldn't bump the version nor write any state.
t1: begin readonly [ops]
t1: state
---
t1: v1 ro active={}

# Writes should error.
t1: !set foo=bar
t1: !delete foo
---
t1: Error: read-only transaction
t1: Error: read-only transaction

# Start a new read-write transaction, then another read-only transaction which
# should have it in its active set. t1 should not be in the active set, because
# it's read-only.
t2: begin
t2: state
---
t2: v1 rw active={}

t3: begin readonly [ops]
t3: state
---
t3: v2 ro active={1}

# t2 also shouldn't be in t1's active set. Visibility of t2's writes is
# handled explicitly for t1.
t2: state
---
t2: v1 rw active={}

# Both committing and rolling back read-only transactions are noops.
t1: commit [ops]
t3: commit [ops]
---
ok


================================================
FILE: src/storage/testscripts/mvcc/delete
================================================
# Deletes should work on existing, missing, and deleted keys.

import 1 a=1 b=1 x=
---
ok

# Delete an existing, missing, and deleted key. Show engine operations.
t1: begin
t1: delete a m x [ops]
---
t1: engine set mvcc:TxnWrite(2, "a") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02a\x00\x00" → ""]
t1: engine set mvcc:Version("a", 2) → None ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]
t1: engine set mvcc:TxnWrite(2, "m") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02m\x00\x00" → ""]
t1: engine set mvcc:Version("m", 2) → None ["\x04m\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]
t1: engine set mvcc:TxnWrite(2, "x") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02x\x00\x00" → ""]
t1: engine set mvcc:Version("x", 2) → None ["\x04x\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]

t1: scan
---
t1: "b" → "1"

# Set and then delete a key, both an existing and a missing one.
t1: set b=2 c=2 [ops]
---
t1: engine set mvcc:TxnWrite(2, "b") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02b\x00\x00" → ""]
t1: engine set mvcc:Version("b", 2) → "2" ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]
t1: engine set mvcc:TxnWrite(2, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02c\x00\x00" → ""]
t1: engine set mvcc:Version("c", 2) → "2" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]

t1: scan
---
t1: "b" → "2"
t1: "c" → "2"

t1: delete b c [ops]
---
t1: engine set mvcc:TxnWrite(2, "b") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02b\x00\x00" → ""]
t1: engine set mvcc:Version("b", 2) → None ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]
t1: engine set mvcc:TxnWrite(2, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02c\x00\x00" → ""]
t1: engine set mvcc:Version("c", 2) → None ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]

t1: scan
---
ok

dump
---
mvcc:NextVersion → 3 ["\x00" → "\x03"]
mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
mvcc:TxnWrite(2, "a") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02a\x00\x00" → ""]
mvcc:TxnWrite(2, "b") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02b\x00\x00" → ""]
mvcc:TxnWrite(2, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02c\x00\x00" → ""]
mvcc:TxnWrite(2, "m") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02m\x00\x00" → ""]
mvcc:TxnWrite(2, "x") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02x\x00\x00" → ""]
mvcc:Version("a", 1) → "1" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]
mvcc:Version("a", 2) → None ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]
mvcc:Version("b", 1) → "1" ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]
mvcc:Version("b", 2) → None ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]
mvcc:Version("c", 2) → None ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]
mvcc:Version("m", 2) → None ["\x04m\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]
mvcc:Version("x", 1) → None ["\x04x\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x00"]
mvcc:Version("x", 2) → None ["\x04x\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x00"]


================================================
FILE: src/storage/testscripts/mvcc/delete_conflict
================================================
# Delete should return serialization errors both for uncommitted versions
# (past and future), and future committed versions.

t1: begin
t2: begin
t3: begin
t4: begin
---
ok

t1: set a=1
t3: set c=3
t4: set d=4
t4: commit
---
ok

t2: !delete a # past uncommitted
t2: !delete c # future uncommitted
t2: !delete d # future committed
---
t2: Error: serialization failure, retry transaction
t2: Error: serialization failure, retry transaction
t2: Error: serialization failure, retry transaction

dump
---
mvcc:NextVersion → 5 ["\x00" → "\x05"]
mvcc:TxnActive(1) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x01" → ""]
mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
mvcc:TxnActiveSnapshot(2) → {1} ["\x02\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x01"]
mvcc:TxnActiveSnapshot(3) → {1,2} ["\x02\x00\x00\x00\x00\x00\x00\x00\x03" → "\x02\x01\x02"]
mvcc:TxnActiveSnapshot(4) → {1,2,3} ["\x02\x00\x00\x00\x00\x00\x00\x00\x04" → "\x03\x01\x02\x03"]
mvcc:TxnWrite(1, "a") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01a\x00\x00" → ""]
mvcc:TxnWrite(3, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03c\x00\x00" → ""]
mvcc:Version("a", 1) → "1" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]
mvcc:Version("c", 3) → "3" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x013"]
mvcc:Version("d", 4) → "4" ["\x04d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x014"]


================================================
FILE: src/storage/testscripts/mvcc/get
================================================
# Get should return the correct latest value.

import 1 key=1 updated=1 deleted=1 tombstone=
import 2 updated=2 deleted=
---
ok

t1: begin readonly
t1: scan
---
t1: "key" → "1"
t1: "updated" → "2"

# Get results should mirror scan.
t1: get key updated deleted tombstone missing
---
t1: "key" → "1"
t1: "updated" → "2"
t1: "deleted" → None
t1: "tombstone" → None
t1: "missing" → None


================================================
FILE: src/storage/testscripts/mvcc/get_isolation
================================================
# Get should be isolated from concurrent transactions.

# Past committed.
t1: begin
t1: set a=1 b=1 d=1 e=1
t1: commit
---
ok

# Past uncommitted.
t2: begin
t2: set a=2 c=2
t2: delete b
---
ok

# Begin the read transaction.
t3: begin readonly
---
ok

# Future committed.
t4: begin
t4: set d=3 f=3
t4: delete e
t4: commit
---
ok

# Future uncommitted.
t5: begin
t5: set d=4 g=4
t5: delete f
---
ok

# Get each key.
t3: get a b c d e f g
---
t3: "a" → "1"
t3: "b" → "1"
t3: "c" → None
t3: "d" → "1"
t3: "e" → "1"
t3: "f" → None
t3: "g" → None


================================================
FILE: src/storage/testscripts/mvcc/resume
================================================
# Resume should resume a transaction with the same state.

# Commit some visible values.
t1: begin
t1: set a=1 b=1
t1: commit
---
ok

# We then start three transactions, of which we will resume t3.  We commit t2
# and t4's changes, which should not be visible, and write a change for t3 which
# should be visible.
t2: begin
t3: begin
t4: begin
---
ok

t2: set a=2
t3: set b=3
t4: set c=4
t2: commit
t4: commit
---
ok

# We now resume t3 as t5.
t3: state
---
t3: v3 rw active={2}

t5: resume '{"version":3, "read_only":false, "active":[2]}'
t5: state
---
t5: v3 rw active={2}

# t5 can see its own changes, but not the others.
t5: scan
---
t5: "a" → "1"
t5: "b" → "3"

# A new transaction should not see t3/5's uncommitted changes.
t6: begin
t6: scan
---
t6: "a" → "2"
t6: "b" → "1"
t6: "c" → "4"

# Once t5 commits, a separate transaction should see its changes.
t5: commit [ops]
---
t5: engine delete mvcc:TxnWrite(3, "b") ["\x03\x00\x00\x00\x00\x00\x00\x00\x03b\x00\x00"]
t5: engine delete mvcc:TxnActive(3) ["\x01\x00\x00\x00\x00\x00\x00\x00\x03"]

t7: begin
t7: scan
---
t7: "a" → "2"
t7: "b" → "3"
t7: "c" → "4"

# Resuming a committed transaction should error.
t8: !resume '{"version":3, "read_only":false, "active":[2]}'
---
t8: Error: invalid input: no active transaction at version 3

# It should also be possible to start a snapshot transaction as of t3's version
# (v3) and resume it. It should not see t3's writes, nor t2's.
t8: begin readonly as_of=3
t8: state
---
t8: v3 ro active={2}

t8: scan
---
t8: "a" → "1"
t8: "b" → "1"

t9: resume '{"version":3, "read_only":true, "active":[2]}'
t9: state
---
t9: v3 ro active={2}

t9: scan
---
t9: "a" → "1"
t9: "b" → "1"


================================================
FILE: src/storage/testscripts/mvcc/rollback
================================================
# Tests that transaction rollback properly rolls back uncommitted writes,
# allowing other concurrent transactions to write the keys.

import 1 a=0 b=0 c=0 d=0
---
ok

# t2 will be rolled back. t1 and t3 are concurrent transactions.
t1: begin
t2: begin
t3: begin
---
ok

t1: set a=1
t2: set b=2
t2: delete c
t3: set d=3
---
ok

dump
---
mvcc:NextVersion → 5 ["\x00" → "\x05"]
mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
mvcc:TxnActive(4) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x04" → ""]
mvcc:TxnActiveSnapshot(3) → {2} ["\x02\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x02"]
mvcc:TxnActiveSnapshot(4) → {2,3} ["\x02\x00\x00\x00\x00\x00\x00\x00\x04" → "\x02\x02\x03"]
mvcc:TxnWrite(2, "a") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02a\x00\x00" → ""]
mvcc:TxnWrite(3, "b") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03b\x00\x00" → ""]
mvcc:TxnWrite(3, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03c\x00\x00" → ""]
mvcc:TxnWrite(4, "d") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x04d\x00\x00" → ""]
mvcc:Version("a", 1) → "0" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x010"]
mvcc:Version("a", 2) → "1" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x011"]
mvcc:Version("b", 1) → "0" ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x010"]
mvcc:Version("b", 3) → "2" ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x012"]
mvcc:Version("c", 1) → "0" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x010"]
mvcc:Version("c", 3) → None ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x00"]
mvcc:Version("d", 1) → "0" ["\x04d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x010"]
mvcc:Version("d", 4) → "3" ["\x04d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x013"]

# Both t1 and t3 will conflict with t2.
t1: !set b=1
t3: !set c=3
---
t1: Error: serialization failure, retry transaction
t3: Error: serialization failure, retry transaction

# When t2 is rolled back, none of its writes will be visible, and t1 and t3 can
# perform their writes and successfully commit.
t2: rollback [ops]
---
t2: engine delete mvcc:Version("b", 3) ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"]
t2: engine delete mvcc:TxnWrite(3, "b") ["\x03\x00\x00\x00\x00\x00\x00\x00\x03b\x00\x00"]
t2: engine delete mvcc:Version("c", 3) ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"]
t2: engine delete mvcc:TxnWrite(3, "c") ["\x03\x00\x00\x00\x00\x00\x00\x00\x03c\x00\x00"]
t2: engine delete mvcc:TxnActive(3) ["\x01\x00\x00\x00\x00\x00\x00\x00\x03"]

t4: begin readonly
t4: scan
---
t4: "a" → "0"
t4: "b" → "0"
t4: "c" → "0"
t4: "d" → "0"

t1: set b=1
t1: commit
t3: set c=3
t3: commit
---
ok

t5: begin readonly
t5: scan
---
t5: "a" → "1"
t5: "b" → "1"
t5: "c" → "3"
t5: "d" → "3"

dump
---
mvcc:NextVersion → 5 ["\x00" → "\x05"]
mvcc:TxnActiveSnapshot(3) → {2} ["\x02\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x02"]
mvcc:TxnActiveSnapshot(4) → {2,3} ["\x02\x00\x00\x00\x00\x00\x00\x00\x04" → "\x02\x02\x03"]
mvcc:Version("a", 1) → "0" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x010"]
mvcc:Version("a", 2) → "1" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x011"]
mvcc:Version("b", 1) → "0" ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x010"]
mvcc:Version("b", 2) → "1" ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x011"]
mvcc:Version("c", 1) → "0" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x010"]
mvcc:Version("c", 4) → "3" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x013"]
mvcc:Version("d", 1) → "0" ["\x04d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x010"]
mvcc:Version("d", 4) → "3" ["\x04d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x013"]


================================================
FILE: src/storage/testscripts/mvcc/scan
================================================
# Scans should use correct key and time bounds. Sets up this dataset:
# 
# T
# 4             x    ba4
# 3   x    a3   b3        x
# 2        x         ba2  bb2  bc2
# 1   B1   a1   x                   c1
#     B    a    b    ba   bb   bc   c

import 1 B=B1 a=a1 b= c=c1
import 2 a= ba=ba2 bb=bb2 bc=bc2
import 3 B= a=a3 b=b3 bb=
import 4 b= ba=ba4
---
ok

# Full scans at all timestamps.
t1: begin readonly as_of=1
t1: scan
---
ok

t2: begin readonly as_of=2
t2: scan
---
t2: "B" → "B1"
t2: "a" → "a1"
t2: "c" → "c1"

t3: begin readonly as_of=3
t3: scan
---
t3: "B" → "B1"
t3: "ba" → "ba2"
t3: "bb" → "bb2"
t3: "bc" → "bc2"
t3: "c" → "c1"

t4: begin readonly as_of=4
t4: scan
---
t4: "a" → "a3"
t4: "b" → "b3"
t4: "ba" → "ba2"
t4: "bc" → "bc2"
t4: "c" → "c1"

t5: begin readonly
t5: scan
---
t5: "a" → "a3"
t5: "ba" → "ba4"
t5: "bc" → "bc2"
t5: "c" → "c1"

# Various bounded scans around ba-bc at version 3.
t3: scan ba..bc
---
t3: "ba" → "ba2"
t3: "bb" → "bb2"

t3: scan "ba..=bc"
---
t3: "ba" → "ba2"
t3: "bb" → "bb2"
t3: "bc" → "bc2"

t3: scan ba..
---
t3: "ba" → "ba2"
t3: "bb" → "bb2"
t3: "bc" → "bc2"
t3: "c" → "c1"

t3: scan "..bc"
---
t3: "B" → "B1"
t3: "ba" → "ba2"
t3: "bb" → "bb2"

t3: scan "..=bc"
---
t3: "B" → "B1"
t3: "ba" → "ba2"
t3: "bb" → "bb2"
t3: "bc" → "bc2"


================================================
FILE: src/storage/testscripts/mvcc/scan_isolation
================================================
# Scan should be isolated from concurrent transactions.

# Past committed.
t1: begin
t1: set a=1 b=1 d=1 e=1
t1: commit
---
ok

# Past uncommitted.
t2: begin
t2: set a=2 c=2
t2: delete b
---
ok

# Begin the read transaction.
t3: begin readonly
---
ok

# Future committed.
t4: begin
t4: set d=3 f=3
t4: delete e
t4: commit
---
ok

# Future uncommitted.
t5: begin
t5: set d=4 g=4
t5: delete f
---
ok

# Scan keys.
t3: scan
---
t3: "a" → "1"
t3: "b" → "1"
t3: "d" → "1"
t3: "e" → "1"


================================================
FILE: src/storage/testscripts/mvcc/scan_key_version_encoding
================================================
# Tests that the key encoding is resistant to key/version overlap.
# For example, a naïve concatenation of keys and versions would
# produce incorrect ordering in this case:
#
# 00|00 00 00 00 00 00 00 01
# 00 00 00 00 00 00 00 00 02|00 00 00 00 00 00 00 02
# 00|00 00 00 00 00 00 00 03

t1: begin
t1: set "\x00"="\x01" [ops]
t1: commit
---
t1: engine set mvcc:TxnWrite(1, "\x00") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\xff\x00\x00" → ""]
t1: engine set mvcc:Version("\x00", 1) → "\x01" ["\x04\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x01\x01"]

t2: begin
t2: set "\x00"="\x02" [ops]
t2: set "\x00\x00\x00\x00\x00\x00\x00\x00\x02"="\x02" [ops]
t2: commit
---
t2: engine set mvcc:TxnWrite(2, "\x00") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x00\xff\x00\x00" → ""]
t2: engine set mvcc:Version("\x00", 2) → "\x02" ["\x04\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x01\x02"]
t2: engine set mvcc:TxnWrite(2, "\x00\x00\x00\x00\x00\x00\x00\x00\x02") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00" → ""]
t2: engine set mvcc:Version("\x00\x00\x00\x00\x00\x00\x00\x00\x02", 2) → "\x02" ["\x04\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x01\x02"]

t3: begin
t3: set "\x00"="\x03" [ops]
t3: commit
---
t3: engine set mvcc:TxnWrite(3, "\x00") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03\x00\xff\x00\x00" → ""]
t3: engine set mvcc:Version("\x00", 3) → "\x03" ["\x04\x00\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x01\x03"]

t4: begin readonly
t4: scan
---
t4: "\x00" → "\x03"
t4: "\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x02"


================================================
FILE: src/storage/testscripts/mvcc/scan_prefix
================================================
# Prefix scans should use correct key and time bounds. Sets up this dataset:
# 
# T
# 4             x    ba4
# 3   x    a3   b3        x
# 2        x         ba2  bb2  bc2
# 1   B1   a1   x                   c1
#     B    a    b    ba   bb   bc   c

import 1 B=B1 a=a1 b= c=c1
import 2 a= ba=ba2 bb=bb2 bc=bc2
import 3 B= a=a3 b=b3 bb=
import 4 b= ba=ba4
---
ok

# Full scans at all timestamps.
t1: begin readonly as_of=1
t1: scan_prefix ""
---
ok

t2: begin readonly as_of=2
t2: scan_prefix ""
---
t2: "B" → "B1"
t2: "a" → "a1"
t2: "c" → "c1"

t3: begin readonly as_of=3
t3: scan_prefix ""
---
t3: "B" → "B1"
t3: "ba" → "ba2"
t3: "bb" → "bb2"
t3: "bc" → "bc2"
t3: "c" → "c1"

t4: begin readonly as_of=4
t4: scan_prefix ""
---
t4: "a" → "a3"
t4: "b" → "b3"
t4: "ba" → "ba2"
t4: "bc" → "bc2"
t4: "c" → "c1"

t5: begin readonly
t5: scan_prefix ""
---
t5: "a" → "a3"
t5: "ba" → "ba4"
t5: "bc" → "bc2"
t5: "c" → "c1"

# Various prefixes at version 3.
t3: scan_prefix B
---
t3: "B" → "B1"

t3: scan_prefix b
---
t3: "ba" → "ba2"
t3: "bb" → "bb2"
t3: "bc" → "bc2"

t3: scan_prefix bb
---
t3: "bb" → "bb2"

t3: scan_prefix bbb
---
ok

# Various prefixes at version 4.
t4: scan_prefix B
---
ok

t4: scan_prefix b
---
t4: "b" → "b3"
t4: "ba" → "ba2"
t4: "bc" → "bc2"

t4: scan_prefix bb
---
ok


================================================
FILE: src/storage/testscripts/mvcc/set
================================================
# Sets should work on existing, missing, and deleted keys.

import a=1 b=1 x=
---
ok

# Can replace an existing key and tombstone.
t1: begin
t1: set a=2 x=2 [ops]
---
t1: engine set mvcc:TxnWrite(2, "a") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02a\x00\x00" → ""]
t1: engine set mvcc:Version("a", 2) → "2" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]
t1: engine set mvcc:TxnWrite(2, "x") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02x\x00\x00" → ""]
t1: engine set mvcc:Version("x", 2) → "2" ["\x04x\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]

t1: scan
---
t1: "a" → "2"
t1: "b" → "1"
t1: "x" → "2"

# Can write a new key, replace it, and be idempotent.
t1: set c=1 c=2 c=2 [ops]
---
t1: engine set mvcc:TxnWrite(2, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02c\x00\x00" → ""]
t1: engine set mvcc:Version("c", 2) → "1" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x011"]
t1: engine set mvcc:TxnWrite(2, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02c\x00\x00" → ""]
t1: engine set mvcc:Version("c", 2) → "2" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]
t1: engine set mvcc:TxnWrite(2, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02c\x00\x00" → ""]
t1: engine set mvcc:Version("c", 2) → "2" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]

t1: scan
---
t1: "a" → "2"
t1: "b" → "1"
t1: "c" → "2"
t1: "x" → "2"

dump
---
mvcc:NextVersion → 3 ["\x00" → "\x03"]
mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
mvcc:TxnWrite(2, "a") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02a\x00\x00" → ""]
mvcc:TxnWrite(2, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02c\x00\x00" → ""]
mvcc:TxnWrite(2, "x") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x02x\x00\x00" → ""]
mvcc:Version("a", 1) → "1" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]
mvcc:Version("a", 2) → "2" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]
mvcc:Version("b", 1) → "1" ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]
mvcc:Version("c", 2) → "2" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]
mvcc:Version("x", 1) → None ["\x04x\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x00"]
mvcc:Version("x", 2) → "2" ["\x04x\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x012"]


================================================
FILE: src/storage/testscripts/mvcc/set_conflict
================================================
# Set should return serialization errors both for uncommitted versions
# (past and future), and future committed versions.

t1: begin
t2: begin
t3: begin
t4: begin
---
ok

t1: set a=1
t3: set c=3
t4: set d=4
t4: commit
---
ok

t2: !set a=2 # past uncommitted
t2: !set c=2 # future uncommitted
t2: !set d=2 # future committed
---
t2: Error: serialization failure, retry transaction
t2: Error: serialization failure, retry transaction
t2: Error: serialization failure, retry transaction

dump
---
mvcc:NextVersion → 5 ["\x00" → "\x05"]
mvcc:TxnActive(1) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x01" → ""]
mvcc:TxnActive(2) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x02" → ""]
mvcc:TxnActive(3) → "" ["\x01\x00\x00\x00\x00\x00\x00\x00\x03" → ""]
mvcc:TxnActiveSnapshot(2) → {1} ["\x02\x00\x00\x00\x00\x00\x00\x00\x02" → "\x01\x01"]
mvcc:TxnActiveSnapshot(3) → {1,2} ["\x02\x00\x00\x00\x00\x00\x00\x00\x03" → "\x02\x01\x02"]
mvcc:TxnActiveSnapshot(4) → {1,2,3} ["\x02\x00\x00\x00\x00\x00\x00\x00\x04" → "\x03\x01\x02\x03"]
mvcc:TxnWrite(1, "a") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01a\x00\x00" → ""]
mvcc:TxnWrite(3, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x03c\x00\x00" → ""]
mvcc:Version("a", 1) → "1" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]
mvcc:Version("c", 3) → "3" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03" → "\x01\x013"]
mvcc:Version("d", 4) → "4" ["\x04d\x00\x00\x00\x00\x00\x00\x00\x00\x00\x04" → "\x01\x014"]


================================================
FILE: src/storage/testscripts/mvcc/unversioned
================================================
# Tests unversioned keys.

# Getting a missing unversioned key returns None.
get_unversioned a
---
"a" → None

# Setting and getting an unversioned key should work. Dump engine operations.
set_unversioned a=0 [ops]
get_unversioned a
---
engine set mvcc:Unversioned("a") → "0" ["\x05a\x00\x00" → "0"]
"a" → "0"

# Write some versioned keys with the same keys, interleaved between unversioned.
# The raw engine writes show that the internal keys are different.
t1: begin
t1: set a=1 b=1 c=1 [ops]
t1: commit
---
t1: engine set mvcc:TxnWrite(1, "a") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01a\x00\x00" → ""]
t1: engine set mvcc:Version("a", 1) → "1" ["\x04a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]
t1: engine set mvcc:TxnWrite(1, "b") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01b\x00\x00" → ""]
t1: engine set mvcc:Version("b", 1) → "1" ["\x04b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]
t1: engine set mvcc:TxnWrite(1, "c") → "" ["\x03\x00\x00\x00\x00\x00\x00\x00\x01c\x00\x00" → ""]
t1: engine set mvcc:Version("c", 1) → "1" ["\x04c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" → "\x01\x011"]

# Set another unversioned key overlapping a versioned key.
set_unversioned b=0 d=0 [ops]
---
engine set mvcc:Unversioned("b") → "0" ["\x05b\x00\x00" → "0"]
engine set mvcc:Unversioned("d") → "0" ["\x05d\x00\x00" → "0"]

# An MVCC scan shouldn't see the unversioned keys.
t2: begin readonly
t2: scan
---
t2: "a" → "1"
t2: "b" → "1"
t2: "c" → "1"

# Unversioned gets should not see versioned keys.
get_unversioned a b c d
---
"a" → "0"
"b" → "0"
"c" → None
"d" → "0"

# Replacing an unversioned key should work too.
set_unversioned a=2 [ops]
get_unversioned a
---
engine set mvcc:Unversioned("a") → "2" ["\x05a\x00\x00" → "2"]
"a" → "2"


================================================
FILE: tests/scripts/anomalies
================================================
# Tests transaction anomalies. This is also tested at the MVCC and SQL
# levels, but we may as well have an end-to-end test for them.
#
# Uses a single script to avoid cluster startup times for each test.

cluster nodes=5
---
ok

> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
---
ok

# Dirty read: when c2 can read an uncommitted value written by c1. Snapshot
# isolation prevents this.
c1:> BEGIN
c1:> INSERT INTO test VALUES (1, 'a')
---
ok

c2:> BEGIN
c2:> SELECT * FROM test WHERE id = 1
---
ok

c1:> ROLLBACK
c2:> ROLLBACK
---
ok

# Dirty write: when c2 overwrites an uncommitted value written by c1. Snapshot
# isolation prevents this.

c1:> BEGIN
c1:> INSERT INTO test VALUES (1, 'a')
---
ok

c2:> BEGIN
c2:!> INSERT INTO test VALUES (1, 'a')
---
c2: Error: serialization failure, retry transaction

c1:> ROLLBACK
c2:> ROLLBACK
---
ok

# Fuzzy (or unrepeatable) read: when c2 sees a value change after c1 updates it.
# Snapshot isolation prevents this.

> INSERT INTO test VALUES (1, 'a')
---
ok

c1:> BEGIN
c2:> BEGIN
---
ok

c2:> SELECT * FROM test WHERE id = 1
---
c2: 1, 'a'

c1:> UPDATE test SET value = 'b' WHERE id = 1
c1:> COMMIT
c1:> SELECT * FROM test
---
c1: 1, 'b'

c2:> SELECT * FROM test WHERE id = 1
---
c2: 1, 'a'

c2:> ROLLBACK
> DELETE FROM test
---
ok

# Lost update: when c1 and c2 both read a value and update it, where c2's update
# replaces c1's. Snapshot isolation prevents this.

c1:> BEGIN
c1:> SELECT * FROM test WHERE id = 1
---
ok

c2:> BEGIN
c2:> SELECT * FROM test WHERE id = 1
---
ok

c1:> INSERT INTO test VALUES (1, 'a')
c1:> COMMIT
---
ok

c2:!> INSERT INTO test VALUES (1, 'a')
---
c2: Error: serialization failure, retry transaction

c2:> ROLLBACK
> DELETE FROM test
---
ok

# Phantom read: when c1 reads entries matching some predicate, but a
# modification by c2 changes which entries match the predicate such that a later
# read by c1 returns them. Snapshot isolation prevents this.

> INSERT INTO test VALUES (1, 'a'), (2, 'b'), (3, 'c')
---
ok

c1:> BEGIN
c2:> BEGIN
---
ok

c1:> SELECT * FROM test WHERE id > 1
---
c1: 2, 'b'
c1: 3, 'c'

c2:> DELETE FROM test WHERE id = 2
c2:> INSERT INTO test VALUES (4, 'd')
c2:> COMMIT
---
ok

c1:> SELECT * FROM test WHERE id > 1
---
c1: 2, 'b'
c1: 3, 'c'

c1:> ROLLBACK
> DELETE FROM test
---
ok

# Read skew: when c1 reads a and b, but c2 modifies b in between the reads.
# Snapshot isolation prevents this.

> INSERT INTO test VALUES (1, 'a'), (2, 'b')
---
ok

c1:> BEGIN
c2:> BEGIN
---
ok

c1:> SELECT * FROM test WHERE id = 1
---
c1: 1, 'a'

c2:> UPDATE test SET value = 'b' WHERE id = 1
c2:> UPDATE test SET value = 'a' WHERE id = 2
c2:> COMMIT
---
ok

c1:> SELECT * FROM test WHERE id = 2
---
c1: 2, 'b'

c1:> ROLLBACK
> DELETE FROM test
---
ok

# Write skew: when c1 reads a and writes it to b while c2 reads b and writes it
# to a. Snapshot isolation does not prevent this, which is expected, so we
# assert the anomalous behavior. Fixing this would require implementing
# serializable snapshot isolation.

> INSERT INTO test VALUES (1, 'a'), (2, 'b')
---
ok

c1:> BEGIN
c2:> BEGIN
---
ok

c1:> SELECT * FROM test WHERE id = 1
c2:> SELECT * FROM test WHERE id = 2
---
c1: 1, 'a'
c2: 2, 'b'

c1:> UPDATE test SET value = 'a' WHERE id = 2
c2:> UPDATE test SET value = 'b' WHERE id = 1
---
ok

c1:> COMMIT
c2:> COMMIT
---
ok

> SELECT * FROM test
---
1, 'b'
2, 'a'


================================================
FILE: tests/scripts/client
================================================
# Tests various client operations.
#
# Uses a single-node cluster for determinism.

cluster nodes=1
---
ok

# Add some tables and data.
> CREATE TABLE countries (id STRING PRIMARY KEY, name STRING NOT NULL)
> INSERT INTO countries VALUES ('fr', 'France'), ('ru', 'Russia'), ('us', 'United States of America')
> CREATE TABLE genres (id INTEGER PRIMARY KEY, name STRING NOT NULL)
> INSERT INTO genres VALUES (1, 'Science Fiction'), (2, 'Action'), (3, 'Comedy')
> CREATE TABLE studios (id INTEGER PRIMARY KEY, name STRING NOT NULL, country_id STRING REFERENCES countries)
> INSERT INTO studios VALUES (1, 'Mosfilm', 'ru'), (2, 'Lionsgate', 'us'), (3, 'StudioCanal', 'fr'), (4, 'Warner Bros', 'us')
> CREATE TABLE movies ( \
    id INTEGER PRIMARY KEY, \
    title STRING NOT NULL, \
    studio_id INTEGER NOT NULL REFERENCES studios, \
    genre_id INTEGER NOT NULL REFERENCES genres, \
    released INTEGER NOT NULL, \
    rating FLOAT, \
    ultrahd BOOLEAN \
)
> INSERT INTO movies VALUES \
    (1, 'Stalker', 1, 1, 1979, 8.2, NULL), \
    (2, 'Sicario', 2, 2, 2015, 7.6, TRUE), \
    (3, 'Primer', 3, 1, 2004, 6.9, NULL), \
    (4, 'Heat', 4, 2, 1995, 8.2, TRUE), \
    (5, 'The Fountain', 4, 1, 2006, 7.2, FALSE), \
    (6, 'Solaris', 1, 1, 1972, 8.1, NULL), \
    (7, 'Gravity', 4, 1, 2013, 7.7, TRUE), \
    (8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE), \
    (9, 'Birdman', 4, 3, 2014, 7.7, TRUE), \
    (10, 'Inception', 4, 1, 2010, 8.8, TRUE)
---
ok

# List the tables and display table schemas. Error on missing table.
tables
---
countries
genres
movies
studios

table movies
---
CREATE TABLE movies (
  id INTEGER PRIMARY KEY,
  title STRING NOT NULL,
  studio_id INTEGER NOT NULL INDEX REFERENCES studios,
  genre_id INTEGER NOT NULL INDEX REFERENCES genres,
  released INTEGER NOT NULL,
  rating FLOAT DEFAULT NULL,
  ultrahd BOOLEAN DEFAULT NULL
)

table movies raw=true
---
Table {
    name: "movies",
    primary_key: 0,
    columns: [
        Column {
            name: "id",
            datatype: Integer,
            nullable: false,
            default: None,
            unique: true,
            index: false,
            references: None,
        },
        Column {
            name: "title",
            datatype: String,
            nullable: false,
            default: None,
            unique: false,
            index: false,
            references: None,
        },
        Column {
            name: "studio_id",
            datatype: Integer,
            nullable: false,
            default: None,
            unique: false,
            index: true,
            references: Some(
                "studios",
            ),
        },
        Column {
            name: "genre_id",
            datatype: Integer,
            nullable: false,
            default: None,
            unique: false,
            index: true,
            references: Some(
                "genres",
            ),
        },
        Column {
            name: "released",
            datatype: Integer,
            nullable: false,
            default: None,
            unique: false,
            index: false,
            references: None,
        },
        Column {
            name: "rating",
            datatype: Float,
            nullable: true,
            default: Some(
                Null,
            ),
            unique: false,
            index: false,
            references: None,
        },
        Column {
            name: "ultrahd",
            datatype: Boolean,
            nullable: true,
            default: Some(
                Null,
            ),
            unique: false,
            index: false,
            references: None,
        },
    ],
}

table countries
table genres
table studios
---
CREATE TABLE countries (
  id STRING PRIMARY KEY,
  name STRING NOT NULL
)
CREATE TABLE genres (
  id INTEGER PRIMARY KEY,
  name STRING NOT NULL
)
CREATE TABLE studios (
  id INTEGER PRIMARY KEY,
  name STRING NOT NULL,
  country_id STRING DEFAULT NULL INDEX REFERENCES countries
)

!table missing
---
Error: invalid input: table missing does not exist

# Fetch server status.
status
---
Status {
    server: 1,
    raft: Status {
        leader: 1,
        term: 1,
        match_index: {
            1: 25,
        },
        commit_index: 25,
        applied_index: 25,
        storage: Status {
            name: "bitcask",
            keys: 27,
            size: 1169,
            disk_size: 1649,
            live_disk_size: 1385,
        },
    },
    mvcc: Status {
        versions: 8,
        active_txns: 0,
        storage: Status {
            name: "bitcask",
            keys: 36,
            size: 2177,
            disk_size: 8259,
            live_disk_size: 2465,
        },
    },
}


================================================
FILE: tests/scripts/errors
================================================
# Tests various error handling.

cluster nodes=5
---
ok

# A transaction can continue and commit after encountering an error.
> BEGIN
> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
> INSERT INTO test VALUES (1, 'a')
!> INSERT INTO test VALUES (NULL, 'b')
> INSERT INTO test VALUES (2, 'b')
> COMMIT
---
Error: invalid input: invalid primary key NULL

> SELECT * FROM test
---
1, 'a'
2, 'b'

# Closing/disconnecting a client rolls back an open transaction.
c1:> BEGIN
c1:> INSERT INTO test VALUES (3, 'c')
---
ok

c2:!> INSERT INTO test VALUES (3, 'c')
---
c2: Error: serialization failure, retry transaction

c1:> close
---
ok

c2:> INSERT INTO test VALUES (3, 'c')
c2:> SELECT * FROM test
---
c2: 1, 'a'
c2: 2, 'b'
c2: 3, 'c'


================================================
FILE: tests/scripts/isolation
================================================
# Tests transaction isolation.
#
# Transactions are tested more thoroughly in the MVCC tests, this just does some
# basic SQL-level testing.
#
# Sets up a sequence of transactions that each perform a write, and checks
# what they can see.
#
# c1: past, committed before c4 began
# c2: past, commits after c4 began
# c3: past, uncommitted
# c4: test transaction
# c5: future, committed
# c6: future, uncommitted
# c7: future, AS OF version 4

cluster nodes=5
---
ok

# c1: past, committed before c4 began
c1:> BEGIN
c1:> CREATE TABLE test (id INT PRIMARY KEY, value STRING)
c1:> INSERT INTO test VALUES (1, 'a')
c1:> COMMIT
---
ok

# c2: past, commits after c4 began
c2:> BEGIN
c2:> INSERT INTO test VALUES (2, 'b')
---
ok

# c3: past, uncommitted
c3:> BEGIN
c3:> INSERT INTO test VALUES (3, 'c')
---
ok

# c4: test transaction
c4:[result]> BEGIN
c4:> INSERT INTO test VALUES (4, 'd')
---
c4: Begin(TransactionState { version: 4, read_only: false, active: {2, 3} })

# Commit c2.
c2:> COMMIT
---
ok

# c5: future, committed
c5:> BEGIN
c5:> INSERT INTO test VALUES (5, 'e')
c5:> COMMIT
---
ok

# c6: future, uncommitted
c6:> BEGIN
c6:> INSERT INTO test VALUES (6, 'f')
---
ok

# When c4 scans, it should only see the write of c1 and itself.
c4:> SELECT * FROM test
---
c4: 1, 'a'
c4: 4, 'd'

# An AS OF transaction in version 4 should not see c4's uncommitted write.
c7:> BEGIN READ ONLY AS OF SYSTEM TIME 4
c7:> SELECT * FROM test
c7:> ROLLBACK
---
c7: 1, 'a'

# c4 can commit.
c4:> COMMIT
---
ok

# An implicit transaction should see c1, c2, c4, c5:
> SELECT * FROM test
---
1, 'a'
2, 'b'
4, 'd'
5, 'e'

# An AS OF transaction in version 4 should not see c4's write even after it
# has committed, such that it's consistent with the previous AS OF 4. The
# snapshot is taken out at the start of the version.
c7:> BEGIN READ ONLY AS OF SYSTEM TIME 4
c7:> SELECT * FROM test
c7:> ROLLBACK
---
c7: 1, 'a'


================================================
FILE: tests/scripts/queries
================================================
# Tests some basic queries. This is more thoroughly tested in the SQL tests, this
# just tries a few basic things.

cluster nodes=5
---
ok

# Add a movie dataset.
> CREATE TABLE countries (id STRING PRIMARY KEY, name STRING NOT NULL)
> INSERT INTO countries VALUES ('fr', 'France'), ('ru', 'Russia'), ('us', 'United States of America')
> CREATE TABLE genres (id INTEGER PRIMARY KEY, name STRING NOT NULL)
> INSERT INTO genres VALUES (1, 'Science Fiction'), (2, 'Action'), (3, 'Comedy')
> CREATE TABLE studios (id INTEGER PRIMARY KEY, name STRING NOT NULL, country_id STRING REFERENCES countries)
> INSERT INTO studios VALUES (1, 'Mosfilm', 'ru'), (2, 'Lionsgate', 'us'), (3, 'StudioCanal', 'fr'), (4, 'Warner Bros', 'us')
> CREATE TABLE movies ( \
    id INTEGER PRIMARY KEY, \
    title STRING NOT NULL, \
    studio_id INTEGER NOT NULL REFERENCES studios, \
    genre_id INTEGER NOT NULL REFERENCES genres, \
    released INTEGER NOT NULL, \
    rating FLOAT, \
    ultrahd BOOLEAN \
)
> INSERT INTO movies VALUES \
    (1, 'Stalker', 1, 1, 1979, 8.2, NULL), \
    (2, 'Sicario', 2, 2, 2015, 7.6, TRUE), \
    (3, 'Primer', 3, 1, 2004, 6.9, NULL), \
    (4, 'Heat', 4, 2, 1995, 8.2, TRUE), \
    (5, 'The Fountain', 4, 1, 2006, 7.2, FALSE), \
    (6, 'Solaris', 1, 1, 1972, 8.1, NULL), \
    (7, 'Gravity', 4, 1, 2013, 7.7, TRUE), \
    (8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE), \
    (9, 'Birdman', 4, 3, 2014, 7.7, TRUE), \
    (10, 'Inception', 4, 1, 2010, 8.8, TRUE)
---
ok

# Full table scan and point queries. With/without column headers.
[header]> SELECT * FROM movies
---
movies.id, movies.title, movies.studio_id, movies.genre_id, movies.released, movies.rating, movies.ultrahd
1, 'Stalker', 1, 1, 1979, 8.2, NULL
2, 'Sicario', 2, 2, 2015, 7.6, TRUE
3, 'Primer', 3, 1, 2004, 6.9, NULL
4, 'Heat', 4, 2, 1995, 8.2, TRUE
5, 'The Fountain', 4, 1, 2006, 7.2, FALSE
6, 'Solaris', 1, 1, 1972, 8.1, NULL
7, 'Gravity', 4, 1, 2013, 7.7, TRUE
8, 'Blindspotting', 2, 3, 2018, 7.4, TRUE
9, 'Birdman', 4, 3, 2014, 7.7, TRUE
10, 'Inception', 4, 1, 2010, 8.8, TRUE

> SELECT * FROM genres WHERE id = 2
---
2, 'Action'

# Aggregate query.
[header]> SELECT s.name AS studio, COUNT(*) AS movies, AVG(m.rating) AS rating \
    FROM movies m JOIN studios s ON m.studio_id = s.id \
    GROUP BY s.name ORDER BY rating DESC
---
studio, movies, rating
'Mosfilm', 2, 8.149999999999999
'Warner Bros', 5, 7.919999999999999
'Lionsgate', 2, 7.5
'StudioCanal', 1, 6.9

# Try a complex multi-way join with multiple joins of the same table. Uses GROUP
# BY to discard duplicates from the cross join. The query finds all movies
# belonging to a studio that's released at least one movie rated 8 or higher.
> SELECT m.id, m.title, g.name AS genre, s.name AS studio, m.rating \
  FROM movies m JOIN genres g ON m.genre_id = g.id, \
    studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8 \
  WHERE m.studio_id = s.id \
  GROUP BY m.id, m.title, g.name, s.name, m.rating, m.released \
  ORDER BY m.rating DESC, m.released ASC, m.id ASC
---
10, 'Inception', 'Science Fiction', 'Warner Bros', 8.8
1, 'Stalker', 'Science Fiction', 'Mosfilm', 8.2
4, 'Heat', 'Action', 'Warner Bros', 8.2
6, 'Solaris', 'Science Fiction', 'Mosfilm', 8.1
7, 'Gravity', 'Science Fiction', 'Warner Bros', 7.7
9, 'Birdman', 'Comedy', 'Warner Bros', 7.7
5, 'The Fountain', 'Science Fiction', 'Warner Bros', 7.2

# Explain that query.
> EXPLAIN SELECT m.id, m.title, g.name AS genre, s.name AS studio, m.rating \
  FROM movies m JOIN genres g ON m.genre_id = g.id, \
    studios s JOIN movies good ON good.studio_id = s.id AND good.rating >= 8 \
  WHERE m.studio_id = s.id \
  GROUP BY m.id, m.title, g.name, s.name, m.rating, m.released \
  ORDER BY m.rating DESC, m.released ASC, m.id ASC
---
Remap: m.id, m.title, genre, studio, m.rating (dropped: m.released)
└─ Order: m.rating desc, m.released asc, m.id asc
   └─ Projection: m.id, m.title, g.name as genre, s.name as studio, m.rating, m.released
      └─ Aggregate: m.id, m.title, g.name, s.name, m.rating, m.released
         └─ HashJoin: inner on m.studio_id = s.id
            ├─ HashJoin: inner on m.genre_id = g.id
            │  ├─ Scan: movies as m
            │  └─ Scan: genres as g
            └─ HashJoin: inner on s.id = good.studio_id
               ├─ Scan: studios as s
               └─ Scan: movies as good (good.rating > 8 OR good.rating = 8)


================================================
FILE: tests/testcluster.rs
================================================
use std::collections::BTreeMap;
use std::error::Error;
use std::fmt::Write as _;
use std::path::Path;
use std::time::Duration;

use rand::RngExt as _;

use toydb::Client;
use toydb::raft::NodeID;

/// Timeout for node readiness. The clock starts once, before the first node is
/// polled, so the budget is shared by all nodes rather than granted per node.
const TIMEOUT: Duration = Duration::from_secs(5);

/// The base SQL port (+id): node `id` listens for SQL clients on
/// `SQL_BASE_PORT + id` on localhost.
const SQL_BASE_PORT: u16 = 19600;

/// The base Raft port (+id): node `id` listens for Raft peers on
/// `RAFT_BASE_PORT + id` on localhost.
const RAFT_BASE_PORT: u16 = 19700;

/// Runs a toyDB cluster using the built binary in a temporary directory. The
/// cluster will be killed and removed when dropped.
///
/// This runs the cluster as child processes using the built binary instead of
/// spawning in-memory threads for a couple of reasons: it avoids having to
/// gracefully shut down the server (which is complicated by e.g.
/// TcpListener::accept() not being interruptible), and it tests the entire
/// server (and eventually the toySQL client) end-to-end.
pub struct TestCluster {
    /// The running server processes, keyed by node ID.
    servers: BTreeMap<NodeID, TestServer>,
    /// The temporary directory holding each node's data and config. It is
    /// never read after startup, hence the dead_code allowance; it is held
    /// only so that the directory lives as long as the cluster.
    #[allow(dead_code)]
    dir: tempfile::TempDir, // deleted when dropped
}

/// Maps each node ID to its `(raft_port, sql_port)` pair on localhost. The
/// tuple order matters: the Raft port comes first, the SQL port second.
type NodePorts = BTreeMap<NodeID, (u16, u16)>; // raft,sql on localhost

impl TestCluster {
    /// Runs and returns a test cluster. It keeps running until dropped.
    ///
    /// Nodes are given the IDs `1..=nodes`, and each gets its own data
    /// directory (`toydb{id}`) inside a fresh temporary directory. The call
    /// returns once every node has answered a status request.
    ///
    /// # Errors
    ///
    /// Returns an error if `nodes` is 0, if the temporary directory or a
    /// server process can't be set up, or if a node does not become ready
    /// before `TIMEOUT` expires (in which case the last connection/status
    /// error is returned).
    ///
    /// # Panics
    ///
    /// Panics if a node process exits while waiting for it to become ready.
    pub fn run(nodes: u8) -> Result<Self, Box<dyn Error>> {
        // An empty cluster can't serve any client, and connect() would panic
        // when picking a random node out of an empty ID range. Fail early with
        // a clear error instead.
        if nodes == 0 {
            return Err("cluster must have at least one node".into());
        }

        // Create temporary directory.
        let dir = tempfile::TempDir::with_prefix("toydb")?;

        // Allocate port numbers for nodes.
        let ports: NodePorts = (1..=nodes)
            .map(|id| (id, (RAFT_BASE_PORT + id as u16, SQL_BASE_PORT + id as u16)))
            .collect();

        // Start nodes.
        let mut servers = BTreeMap::new();
        for id in 1..=nodes {
            let dir = dir.path().join(format!("toydb{id}"));
            servers.insert(id, TestServer::run(id, &dir, &ports)?);
        }

        // Wait for the nodes to be ready, by fetching the server status. The
        // deadline is shared across all nodes.
        let started = std::time::Instant::now();
        for server in servers.values_mut() {
            while let Err(error) = server.connect().and_then(|mut c| Ok(c.status()?)) {
                // A node that has exited will never become ready; panic with
                // its exit status rather than waiting for the timeout.
                server.assert_alive();
                if started.elapsed() >= TIMEOUT {
                    return Err(error);
                }
                std::thread::sleep(Duration::from_millis(200));
            }
        }

        Ok(Self { servers, dir })
    }

    /// Connects to a random cluster node using a Rust client. Testing with
    /// toysql is too annoying, since we have to deal with rustyline, PTYs,
    /// echoing, multiline editing, etc.
    pub fn connect(&self) -> Result<Client, Box<dyn Error>> {
        // Node IDs are exactly 1..=len (see run()). The count originates from
        // a u8 and is non-zero, so the range is non-empty and the cast to
        // NodeID can't truncate.
        let id = rand::rng().random_range(1..=self.servers.len()) as NodeID;
        self.servers.get(&id).expect("node IDs must be contiguous from 1").connect()
    }
}

/// A toyDB server, running as a child process. The process is killed when
/// this is dropped.
pub struct TestServer {
    /// The node's ID.
    id: NodeID,
    /// Handle to the spawned server process.
    child: std::process::Child,
    /// The localhost port that SQL clients connect to.
    sql_port: u16,
}

impl TestServer {
    /// Starts a toyDB server process for node `id`. The node's config file is
    /// written to `dir` (created if necessary), and `ports` must contain the
    /// ports of every node in the cluster, including this one.
    fn run(id: NodeID, dir: &Path, ports: &NodePorts) -> Result<Self, Box<dyn Error>> {
        // Generate the node's configuration and store it in its directory.
        std::fs::create_dir_all(dir)?;
        let config = Self::build_config(id, dir, ports)?;
        let config_path = dir.join("toydb.yaml");
        std::fs::write(&config_path, config)?;

        // Build the binary.
        //
        // TODO: this may contribute to slow tests, consider building once.
        let binary = escargot::CargoBuild::new().bin("toydb").run()?;

        // Launch the server with the generated config, discarding its output.
        let mut command = binary.command();
        command
            .args(["-c", &config_path.to_string_lossy()])
            .stdout(std::process::Stdio::null())
            .stderr(std::process::Stdio::null());
        let child = command.spawn()?;

        let &(_, sql_port) = ports.get(&id).expect("node not in ports");
        Ok(Self { id, child, sql_port })
    }

    /// Renders the YAML config file contents for the given node. All other
    /// nodes in `ports` are listed as its Raft peers.
    ///
    /// Panics if `id` is not in `ports`.
    fn build_config(id: NodeID, dir: &Path, ports: &NodePorts) -> Result<String, Box<dyn Error>> {
        let &(raft_port, sql_port) = ports.get(&id).expect("node not in ports");
        let data_dir = dir.to_string_lossy();

        let mut config = String::new();
        writeln!(config, "id: {id}")?;
        writeln!(config, "data_dir: {data_dir}")?;
        writeln!(config, "listen_raft: localhost:{raft_port}")?;
        writeln!(config, "listen_sql: localhost:{sql_port}")?;

        // The peers are emitted as a YAML flow mapping. A single-node cluster
        // has no peers and gets an empty `{}`.
        config.push_str("peers: {");
        if ports.len() > 1 {
            config.push('\n');
        }
        for (peer_id, (peer_raft_port, _)) in ports {
            // A node is not its own peer.
            if *peer_id == id {
                continue;
            }
            write!(config, "  '{peer_id}': localhost:{peer_raft_port},")?;
        }
        config.push_str("}\n");

        Ok(config)
    }

    /// Panics with the exit status if the server process has exited.
    fn assert_alive(&mut self) {
        let exited = self.child.try_wait().expect("failed to check exit status");
        if let Some(status) = exited {
            panic!("node {id} exited with {status}", id = self.id)
        }
    }

    /// Opens a regular client connection to the server's SQL port.
    fn connect(&self) -> Result<Client, Box<dyn Error>> {
        let client = Client::connect(("localhost", self.sql_port))?;
        Ok(client)
    }
}

impl Drop for TestServer {
    /// Kills the child process when dropped, and waits for it to terminate.
    ///
    /// Panics if the process can't be killed or waited for.
    fn drop(&mut self) {
        self.child.kill().expect("failed to kill node");
        // Block until the killed process has actually exited.
        self.child.wait().expect("failed to wait for node to terminate");
    }
}


================================================
FILE: tests/tests.rs
================================================
//! A basic set of end-to-end tests as Goldenscripts under tests/scripts/. These
//! spin up actual clusters using the built binary and run operations against
//! them from multiple clients.
//!
//! There are more comprehensive tests elsewhere in the codebase, see the various
//! src/*/testscripts directories.

#![warn(clippy::all)]

mod testcluster;

use std::collections::HashMap;
use std::error::Error;
use std::fmt::Write as _;
use std::path::Path;
use std::sync::{LazyLock, Mutex};

use itertools::Itertools as _;
use test_each_file::test_each_path;

use testcluster::TestCluster;
use toydb::{Client, StatementResult};

// Run goldenscript tests in tests/scripts.
test_each_path! { in "tests/scripts" => test_goldenscript }

/// Runs the goldenscript at `path` against a fresh `Runner`, panicking if the
/// script fails. Invocations are serialized across test threads.
fn test_goldenscript(path: &Path) {
    // We can't run tests concurrently, because the test clusters end up using
    // the same ports. We also don't want to run a bunch of them concurrently.
    // We can't use the #[serial_test] macro either, since it doesn't work with
    // test_each_path. Just use a mutex to serialize them.
    static MUTEX: LazyLock<Mutex<()>> = LazyLock::new(Mutex::default);

    // Ignore poisoning, but keep holding the lock: a failed (panicked) test
    // poisons the mutex, and the guard is then handed back inside the
    // PoisonError. Discarding that error (e.g. via `.ok()`) would drop the
    // guard and release the lock immediately, letting all later tests run
    // concurrently and fight over the same ports.
    let _guard = MUTEX.lock().unwrap_or_else(std::sync::PoisonError::into_inner);

    goldenscript::run(&mut Runner::new(), path).expect("goldenscript failed")
}

/// Runs end-to-end goldenscript tests against a test cluster. See run() for
/// available commands.
#[derive(Default)]
struct Runner {
    /// The cluster under test. None until a `cluster` command has been run.
    cluster: Option<TestCluster>,
    /// Open client connections, keyed by command prefix. Commands without a
    /// prefix use the empty string as key.
    clients: HashMap<String, Client>,
}

impl Runner {
    /// Creates a runner with no cluster and no clients.
    fn new() -> Self {
        Self::default()
    }

    /// Returns the client registered for the given prefix. If there is none
    /// yet, a new client is connected to the cluster and registered first.
    /// Errors if no cluster has been started or the connection fails.
    fn get_client(&mut self, prefix: &Option<String>) -> Result<&mut Client, Box<dyn Error>> {
        use std::collections::hash_map::Entry;

        match self.clients.entry(Self::client_name(prefix).to_string()) {
            Entry::Occupied(existing) => Ok(existing.into_mut()),
            Entry::Vacant(slot) => {
                let Some(cluster) = self.cluster.as_ref() else {
                    return Err("no cluster".into());
                };
                Ok(slot.insert(cluster.connect()?))
            }
        }
    }

    /// Maps a command prefix to a client name. A missing prefix yields the
    /// empty name.
    fn client_name(prefix: &Option<String>) -> &str {
        match prefix {
            Some(name) => name.as_str(),
            None => "",
        }
    }
}

impl goldenscript::Runner for Runner {
    /// Runs a goldenscript command and returns its output.
    ///
    /// The command prefix (e.g. `c1:`) selects which client runs the command;
    /// clients are connected on first use. The following commands are
    /// handled specially:
    ///
    /// * `close`: closes the prefix's client.
    /// * `cluster [nodes=N]`: starts the test cluster (default 1 node).
    /// * `status`: prints the server status.
    /// * `table TABLE [raw=BOOL]`: prints a table schema.
    /// * `tables`: lists the tables.
    ///
    /// Anything else is executed as a SQL statement, which accepts the tags
    /// `header` (print column names for SELECT) and `result` (print the
    /// debug-formatted result of other statements). Tags are only validated
    /// for SQL statements, since the other commands return early.
    fn run(&mut self, command: &goldenscript::Command) -> Result<String, Box<dyn Error>> {
        let mut output = String::new();
        let mut tags = command.tags.clone();

        // Handle simple, non-SQL commands.
        match command.name.as_str() {
            // close
            "close" => {
                command.consume_args().reject_rest()?;
                let name = Self::client_name(&command.prefix);
                if self.clients.remove(name).is_none() {
                    return Err("no client to close".into());
                }
                return Ok(output);
            }

            // cluster [nodes=N]
            "cluster" => {
                let mut args = command.consume_args();
                // Default to a single node. A cluster with 0 nodes would be
                // useless: no client could ever connect to it.
                let nodes = args.lookup_parse("nodes")?.unwrap_or(1);
                args.reject_rest()?;
                if self.cluster.is_some() {
                    return Err("cluster already exists".into());
                }
                self.cluster = Some(TestCluster::run(nodes)?);
                return Ok(output);
            }

            // status
            "status" => {
                command.consume_args().reject_rest()?;
                let status = self.get_client(&command.prefix)?.status()?;
                write!(output, "{status:#?}")?;
                return Ok(output);
            }

            // table TABLE [raw=BOOL]
            "table" => {
                let mut args = command.consume_args();
                let name = &args.next_pos().ok_or("table not given")?.value;
                let raw = args.lookup_parse("raw")?.unwrap_or(false);
                args.reject_rest()?;
                let table = self.get_client(&command.prefix)?.get_table(name)?;
                // raw=true dumps the debug representation of the schema
                // instead of its display form.
                if raw {
                    write!(output, "{table:#?}")?;
                } else {
                    write!(output, "{table}")?;
                }
                return Ok(output);
            }

            // tables
            "tables" => {
                command.consume_args().reject_rest()?;
                let tables = self.get_client(&command.prefix)?.list_tables()?;
                for table in tables {
                    writeln!(output, "{table}")?;
                }
                return Ok(output);
            }

            _ => {}
        }

        // Otherwise, interpret the entire command as a SQL statement.
        if !command.args.is_empty() {
            return Err("statements should be given as a command with no args".into());
        }
        let client = self.get_client(&command.prefix)?;
        let input = &command.name;

        // Execute the command and display the result if requested.
        // SELECT and EXPLAIN results are always output.
        let result = client.execute(input)?;

        match result {
            StatementResult::Select { columns, rows } => {
                if tags.remove("header") {
                    writeln!(output, "{}", columns.into_iter().join(", "))?;
                }
                for row in rows {
                    writeln!(output, "{}", row.into_iter().join(", "))?;
                }
            }
            StatementResult::Explain(root) => writeln!(output, "{root}")?,
            result if tags.remove("result") => writeln!(output, "{result:?}")?,
            _ => {}
        }

        // Any tag that wasn't consumed above is unknown, or doesn't apply to
        // this kind of result.
        if let Some(tag) = tags.iter().next() {
            return Err(format!("invalid tag {tag}").into());
        }

        Ok(output)
    }
}